Extract footnotes and in-text citations from several Word documents (.docx) and show which of the extracted entries are similar to each other

Try extracting footnotes & citations from ftp://ftp.prcity.com/OBSP

In [20]:
# Read in packages for program
from difflib import SequenceMatcher
from lxml import etree
import pandas as pd
import zipfile
import string
import re
In [21]:
# Read zip container and retrieve xml
# Input file name and component ('document' or 'footnotes')
def get_word_xml(docx_filename, component):
    """Return the raw XML of one part of a .docx archive.

    Args:
        docx_filename: Path to the .docx file (a zip container).
        component: Name of the part under ``word/`` without the ``.xml``
            extension, e.g. 'document' or 'footnotes'.

    Returns:
        The contents of ``word/<component>.xml`` as returned by
        ``ZipFile.read``.

    Raises:
        KeyError: If the archive has no ``word/<component>.xml`` member
            (raised by ``ZipFile.read``).
    """
    member = 'word/%s.xml' % (component,)
    # Hand the path to ZipFile directly: it opens the file in binary mode,
    # whereas a text-mode handle from a plain open() cannot be read as a zip
    # on Python 3. The context manager closes the archive on exit.
    with zipfile.ZipFile(docx_filename) as archive:
        return archive.read(member)

# Turn a raw XML string into a tree structure that can be traversed
def get_xml_tree(xml_string):
    """Parse *xml_string* with ``etree.fromstring`` and return the result."""
    parsed_root = etree.fromstring(xml_string)
    return parsed_root

# Walk the tree and yield each matching node together with its text
def _itertext(my_etree, component):
    """Yield text-bearing nodes from ``my_etree.iter(tag=etree.Element)``.

    When *component* is 'footnote', yields 3-tuples ``(node, node.text, kind)``
    where *kind* is 'footnote' for footnote elements and 't' for text
    elements. For any other *component*, yields 2-tuples ``(node, node.text)``
    for text elements only.
    """
    want_footnotes = component == 'footnote'
    for element in my_etree.iter(tag=etree.Element):
        if want_footnotes and _check_element_is(element, 'footnote'):
            yield (element, element.text, 'footnote')
        elif _check_element_is(element, 't'):
            if want_footnotes:
                yield (element, element.text, 't')
            else:
                yield (element, element.text)
def _check_element_is(element, type_char):
    """Return True when *element*'s tag equals *type_char* qualified with the
    WordprocessingML main namespace, i.e. ``{namespace}type_char``."""
    namespace = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    actual_tag = element.tag
    expected_tag = '{{{0}}}{1}'.format(namespace, type_char)
    return actual_tag == expected_tag

# List all footnotes given xml tree
def get_footnotes(xml_tree):
    """Collect the text of each footnote found in *xml_tree*.

    Text runs yielded by ``_itertext`` are accumulated until the next
    footnote element is reached; the accumulated runs are then joined into
    a single string. Characters outside ``string.printable`` are dropped.

    Args:
        xml_tree: Parsed tree, passed through to ``_itertext``.

    Returns:
        A list of strings, one per footnote for which text was collected.
    """
    # Built locally so the function does not depend on a module-level
    # `printable` having been defined before it is called.
    printable = set(string.printable)
    footnotes = []
    pending = []

    for _node, txt, kind in _itertext(xml_tree, 'footnote'):
        if kind == 't':
            # Elements without text report None; there is nothing to keep.
            if txt is None:
                continue
            # A join over a generator works on both Python 2 and 3, whereas
            # filter() returns an iterator (not a string) on Python 3.
            pending.append(''.join(ch for ch in txt if ch in printable))
        elif pending:
            # A new footnote element closes the one being accumulated.
            footnotes.append(str(''.join(pending)))
            pending = []

    # Flush the last footnote; skip when nothing was collected so that no
    # empty entry is emitted.
    if pending:
        footnotes.append(str(''.join(pending)))
    return footnotes

# List all citations given xml tree
def get_citations(xml_tree):
    """Find in-text citations of the form ``(Name, 2014)`` in *xml_tree*.

    The text of every node yielded by ``_itertext`` is stripped of
    characters outside ``string.printable`` and joined with single spaces;
    the result is searched with a regular expression.

    Args:
        xml_tree: Parsed tree, passed through to ``_itertext``.

    Returns:
        A list of the matched citation strings, in order of appearance
        (duplicates included).
    """
    printable = set(string.printable)
    runs = []

    for _node, txt in _itertext(xml_tree, 'other'):
        # Elements without text report None; there is nothing to keep.
        if txt is None:
            continue
        # A join over a generator works on both Python 2 and 3, whereas
        # filter() returns an iterator (not a string) on Python 3.
        runs.append(''.join(ch for ch in txt if ch in printable))
    text = str(' '.join(runs))

    # Raw string: same pattern as before, without invalid-escape warnings.
    # Matches "(" + letters/spaces/dots + "," + optional space + 4 digits + ")".
    return re.findall(r"\([a-zA-Z .]+,\s*\d{4}\)", text)

# Score how closely two strings resemble each other
def similar(a, b):
    """Return the ``SequenceMatcher`` ratio of *a* and *b* (no junk filter)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Generate dataframe showing id, unique footnote or in-text citation, and option for similar ids (True or False)
def generate_dataframe(string_list, sections_dict, similar_option, section_option):
    """Build a DataFrame of the unique strings in *string_list*.

    Args:
        string_list: Footnote or citation strings; duplicates are removed.
        sections_dict: Mapping from a string to the list of section names
            it appears in. Only consulted when *section_option* is true.
        similar_option: When true, add a 'similar_index' column holding the
            comma-separated ids of the other entries whose ``similar``
            ratio exceeds 0.6 (compared after replacing "/" with a space).
        section_option: When true, add a 'sections' column with the
            section names joined by ", ".

    Returns:
        A DataFrame with columns 'id' and 'text', plus 'similar_index'
        and/or 'sections' when requested. Rows are sorted
        case-insensitively by text and 'id' is the row position.
    """
    # Secondary sort on the raw string keeps the order deterministic when
    # two entries differ only by case.
    unique_strings = sorted(set(string_list), key=lambda s: (s.lower(), s))
    df = pd.DataFrame({'id': list(range(len(unique_strings))),
                       'text': unique_strings})

    if similar_option:
        clean_list = [value.replace("/", " ") for value in unique_strings]
        # Results are kept positionally rather than keyed by cleaned text,
        # so two entries that clean to the same string cannot overwrite
        # each other's matches.
        similar_index = []
        for i, outer in enumerate(clean_list):
            matches = [j for j, inner in enumerate(clean_list)
                       if i != j and similar(outer, inner) > 0.6]
            similar_index.append(', '.join(str(j) for j in matches))
        df['similar_index'] = similar_index

    if section_option:
        # Entries with no recorded section get an empty string instead of
        # failing on a missing lookup.
        df['sections'] = [', '.join(sections_dict.get(text, []))
                          for text in unique_strings]
    return df
In [33]:
# Add section names to dataframe
# Sections must have footnotes
# section_names[i] is the label used for files[i]; keep both lists aligned.
section_names=['Geology', 'Biology', 'PD', 'Hazardous','AQ']
files=['./documents/ORSP_adeir 5.1 Geology.docx','./documents/ORSP_adeir 5.3 Biological Resources.docx',
       './documents/ORSP_adeir 3.0 ProjectDescription.docx', './documents/ORSP_adeir 5.5 Hazardous Mat.docx', 
        './documents/ORSP_adeir 5.7 Air Quality.docx']
# Kept at module level for any function that looks up `printable` as a global.
printable = set(string.printable)
footnote_list=[]
# Maps each footnote text to the list of section names it was found in.
sections={}

for index, f in enumerate(files): 
    xml_from_file = get_word_xml(f,'footnotes')
    xml_tree = get_xml_tree(xml_from_file)
    temp=get_footnotes(xml_tree)
    # De-duplicate per file so a section is listed once per footnote.
    for footnote in set(temp):
        sections.setdefault(footnote,[]).append(section_names[index])
    footnote_list.extend(temp)
    
df_footnotes=generate_dataframe(footnote_list,sections,True,True)

# The context manager writes and closes the workbook on exit; this replaces
# the explicit writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter('unique_entries.xlsx') as writer:
    df_footnotes.to_excel(writer, sheet_name='df', index=False)
In [32]:
# Note: This will not find citations in footnotes or in figures
# section_names[i] is the label used for files[i]; keep both lists aligned.
section_names=['Geology', 'Biology', 'PD', 'Hazardous','AQ']
files=['./documents/ORSP_adeir 5.1 Geology.docx','./documents/ORSP_adeir 5.3 Biological Resources.docx',
       './documents/ORSP_adeir 3.0 ProjectDescription.docx', './documents/ORSP_adeir 5.5 Hazardous Mat.docx', 
        './documents/ORSP_adeir 5.7 Air Quality.docx']
printable = set(string.printable)
citation_list=[]
# Maps each citation text to the list of section names it was found in.
sections={}
# enumerate() binds `index` for this loop; without it the section lookup
# relied on whatever value an earlier cell happened to leave behind.
for index, f in enumerate(files):
    xml_from_file = get_word_xml(f,'document')
    xml_tree = get_xml_tree(xml_from_file)
    temp=get_citations(xml_tree)
    # De-duplicate per file so a section is listed once per citation.
    for citation in set(temp):
        sections.setdefault(citation,[]).append(section_names[index])
    citation_list.extend(temp)

df_citations=generate_dataframe(citation_list,sections, False,True)

# NOTE(review): this is the same output path the footnotes cell writes to, so
# running both cells leaves only the citations in the workbook - confirm intent.
# The context manager writes and closes the workbook on exit; this replaces
# the explicit writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter('unique_entries.xlsx') as writer:
    df_citations.to_excel(writer, sheet_name='df', index=False)
In [ ]: