# Try extracting footnotes and in-text citations from the documents at ftp://ftp.prcity.com/OBSP
# Read in packages for program
from difflib import SequenceMatcher
from lxml import etree
import pandas as pd
import zipfile
import string
import re
# Read zip container and retrieve xml
# Input file name and component ('document' or 'footnotes')
def get_word_xml(docx_filename, component):
    """Return the raw XML of one part of a .docx file.

    A .docx file is a zip container; this reads the archive member
    ``word/<component>.xml`` from it.

    Args:
        docx_filename: Path to the .docx file.
        component: Part name without extension, e.g. 'document' or
            'footnotes'.

    Returns:
        The member's contents as raw bytes, as returned by ``ZipFile.read``.

    Raises:
        KeyError: If the archive has no ``word/<component>.xml`` member.
    """
    member_name = 'word/%s.xml' % (component)
    # ZipFile opens the path in binary mode itself; a text-mode file handle
    # cannot be used for a zip archive. The context manager also guarantees
    # that the archive is closed once the member has been read.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read(member_name)
# Parse string into usable tree structure
def get_xml_tree(xml_string):
    """Parse *xml_string* with ``etree.fromstring`` and return the result."""
    parsed = etree.fromstring(xml_string)
    return parsed
# Iterate through all nodes in tree and yield node, node text, and node type
def _itertext(my_etree, component):
    """Walk every element of *my_etree* and yield the ones of interest.

    When *component* is 'footnote', yields 3-tuples
    ``(node, node.text, kind)`` where *kind* is 'footnote' or 't', for
    footnote elements and text elements respectively.

    For any other *component*, yields 2-tuples ``(node, node.text)`` for
    text ('t') elements only.
    """
    elements = my_etree.iter(tag=etree.Element)

    if component != 'footnote':
        for element in elements:
            if _check_element_is(element, 't'):
                yield (element, element.text)
        return

    for element in elements:
        # 'footnote' is tested first, matching the if/elif order callers
        # have always seen; at most one tuple is yielded per element.
        for kind in ('footnote', 't'):
            if _check_element_is(element, kind):
                yield (element, element.text, kind)
                break
def _check_element_is(element, type_char):
    """Return True if *element*'s tag is *type_char* in the Word main namespace.

    The comparison is against the namespace-qualified form
    ``{namespace}type_char``.
    """
    namespace = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    qualified_tag = '{{{0}}}{1}'.format(namespace, type_char)
    return element.tag == qualified_tag
# List all footnotes given xml tree
def get_footnotes(xml_tree):
    """Return the text of every footnote found in *xml_tree*.

    The tree is walked with ``_itertext(xml_tree, 'footnote')``. Text runs
    are accumulated until the next footnote element is reached, at which
    point the accumulated runs are joined into one string. Characters that
    are not in ``string.printable`` are dropped.

    Args:
        xml_tree: Parsed footnotes XML, as accepted by ``_itertext``.

    Returns:
        A list of footnote strings in document order. Footnote elements
        that contribute no text runs produce no entry.
    """
    # Built locally so the function does not rely on a module-level
    # ``printable`` having been assigned before it is called.
    printable_chars = set(string.printable)
    footnotes = []
    pieces = []
    for _node, txt, kind in _itertext(xml_tree, 'footnote'):
        if kind == 't':
            # A text element may have no text at all; skip it explicitly
            # instead of hiding the resulting TypeError.
            if txt is not None:
                pieces.append(
                    ''.join(ch for ch in txt if ch in printable_chars))
        elif pieces:
            # A new footnote element starts: flush the runs collected for
            # the previous footnote.
            footnotes.append(str(''.join(pieces)))
            pieces = []
    # Flush the last footnote, but do not emit an empty entry when nothing
    # was collected.
    if pieces:
        footnotes.append(str(''.join(pieces)))
    return footnotes
# List all in-text citations given xml tree
def get_citations(xml_tree):
    """Return all in-text citations of the form ``(Author, 2010)``.

    Every text run yielded by ``_itertext(xml_tree, 'other')`` is stripped
    of characters outside ``string.printable``; the runs are joined with a
    single space and the result is searched for a parenthesised group of
    letters, spaces and periods followed by a comma and a four-digit year.

    Args:
        xml_tree: Parsed document XML, as accepted by ``_itertext``.

    Returns:
        A list of the matched citation strings, in order of appearance
        (duplicates included).
    """
    printable = set(string.printable)
    runs = []
    for _node, txt in _itertext(xml_tree, 'other'):
        # A text element may have no text at all; skip it explicitly
        # instead of hiding the resulting TypeError.
        if txt is None:
            continue
        runs.append(''.join(ch for ch in txt if ch in printable))
    text = str(' '.join(runs))
    # Raw string so the backslashes reach the regex engine unchanged.
    text_citations = re.findall(r"\([a-zA-Z .]+,\s*\d{4}\)", text)
    return text_citations
# Match strings that look similar
def similar(a, b):
    """Return the ``SequenceMatcher`` similarity ratio of *a* and *b*."""
    matcher = SequenceMatcher(None, a, b)
    score = matcher.ratio()
    return score
# Generate dataframe showing id, unique footnote or in-text citation, and option for similar ids (True or False)
def generate_dataframe(string_list, sections_dict, similar_option, section_option):
    """Build a dataframe of the unique strings in *string_list*.

    Args:
        string_list: Footnote or citation strings; duplicates are removed.
        sections_dict: Maps a string to the list of section names it
            appears in. Only read when *section_option* is true.
        similar_option: If true, add a 'similar_index' column holding the
            comma-separated ids of the other entries whose similarity
            ratio (computed with "/" replaced by a space) exceeds 0.6.
        section_option: If true, add a 'sections' column holding the
            comma-separated section names of each entry; entries missing
            from *sections_dict* get an empty string.

    Returns:
        A DataFrame with columns 'id' and 'text', followed by
        'similar_index' and/or 'sections' when requested. Rows are sorted
        case-insensitively by text and 'id' is the row position.
    """
    # The secondary key keeps the order (and therefore the ids) stable for
    # strings that differ only in case; set iteration order is arbitrary.
    unique_strings = sorted(set(string_list), key=lambda s: (s.lower(), s))
    df = pd.DataFrame({'id': range(len(unique_strings)),
                       'text': unique_strings})

    if similar_option:
        clean_list = [s.replace("/", " ") for s in unique_strings]
        similar_index = []
        for i, outer in enumerate(clean_list):
            # Both directions are evaluated because the ratio is computed
            # separately for (outer, inner) and (inner, outer).
            matches = [str(j) for j, inner in enumerate(clean_list)
                       if i != j and similar(outer, inner) > 0.6]
            similar_index.append(', '.join(matches))
        # Assigned by position so entries whose cleaned text is identical
        # do not overwrite each other's matches.
        df['similar_index'] = similar_index

    if section_option:
        df['sections'] = [', '.join(sections_dict.get(s, []))
                          for s in unique_strings]
    return df
# Add section names to dataframe
# Sections must have footnotes
# Collect the footnotes of every section document, record which sections
# each footnote appears in, and export the unique footnotes to Excel.
section_names = ['Geology', 'Biology', 'PD', 'Hazardous', 'AQ']
files = ['./documents/ORSP_adeir 5.1 Geology.docx',
         './documents/ORSP_adeir 5.3 Biological Resources.docx',
         './documents/ORSP_adeir 3.0 ProjectDescription.docx',
         './documents/ORSP_adeir 5.5 Hazardous Mat.docx',
         './documents/ORSP_adeir 5.7 Air Quality.docx']
# Module-level set of printable characters.
printable = set(string.printable)
footnote_list = []
sections = {}
# section_names and files are parallel lists: entry i of one describes
# entry i of the other.
for index, f in enumerate(files):
    xml_from_file = get_word_xml(f, 'footnotes')
    xml_tree = get_xml_tree(xml_from_file)
    temp = get_footnotes(xml_tree)
    # Iterate the distinct footnotes so a section is listed once per
    # footnote even if the footnote repeats within the document.
    for footnote in set(temp):
        sections.setdefault(footnote, []).append(section_names[index])
    footnote_list.extend(temp)
df_footnotes = generate_dataframe(footnote_list, sections, True, True)
writer = pd.ExcelWriter('unique_entries.xlsx')
df_footnotes.to_excel(writer, sheet_name='df', index=False)
# close() writes the workbook to disk; ExcelWriter.save() no longer exists
# in current pandas releases.
writer.close()
# Note: This will not find citations in footnotes or in figures
# Collect the in-text citations of every section document, record which
# sections each citation appears in, and export the unique citations to Excel.
section_names = ['Geology', 'Biology', 'PD', 'Hazardous', 'AQ']
files = ['./documents/ORSP_adeir 5.1 Geology.docx',
         './documents/ORSP_adeir 5.3 Biological Resources.docx',
         './documents/ORSP_adeir 3.0 ProjectDescription.docx',
         './documents/ORSP_adeir 5.5 Hazardous Mat.docx',
         './documents/ORSP_adeir 5.7 Air Quality.docx']
# Module-level set of printable characters.
printable = set(string.printable)
citation_list = []
sections = {}
# enumerate() supplies the index of the current file; without it the
# section lookup would reuse whatever value ``index`` held before this loop
# and label every citation with the same section.
for index, f in enumerate(files):
    xml_from_file = get_word_xml(f, 'document')
    xml_tree = get_xml_tree(xml_from_file)
    temp = get_citations(xml_tree)
    # Iterate the distinct citations so a section is listed once per
    # citation even if the citation repeats within the document.
    for citation in set(temp):
        sections.setdefault(citation, []).append(section_names[index])
    citation_list.extend(temp)
df_citations = generate_dataframe(citation_list, sections, False, True)
# NOTE(review): this is the same path the footnote export earlier in this
# script writes to, so the citation workbook replaces the footnote workbook.
# Confirm whether a separate file name is intended.
writer = pd.ExcelWriter('unique_entries.xlsx')
df_citations.to_excel(writer, sheet_name='df', index=False)
# close() writes the workbook to disk; ExcelWriter.save() no longer exists
# in current pandas releases.
writer.close()