added GROBID extraction for publications and bs4 parser for xmlized data
mitchbregs committed Sep 25, 2019
1 parent c5ad85b commit 9176625
Showing 5 changed files with 546 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .gitignore
@@ -30,3 +30,17 @@ __pycache__/

# system files
.DS_Store

# logs
*.log

# flat-files
*.json
*.xml
*.csv


# envs
venv/
env/
virtualenv/
83 changes: 83 additions & 0 deletions src/data/README.md
@@ -44,3 +44,86 @@ For more information please contact Principal Investigator, Benjamin Ryan (ben_r
---
## Acknowledgements
This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008.

------


## Extraction Tool - GROBID:


### Installation:

https://grobid.readthedocs.io/en/latest/Install-Grobid/


#### GROBID Client:

Credit to: https://github.com/kermitt2/grobid-client-python


### Usage:

```bash
$ python extract_text.py -h

usage: extract_text.py [-h] [--input INPUT] [--output OUTPUT]
                       [--config CONFIG] [--n N] [--generateIDs]
                       [--consolidate_header] [--consolidate_citations]
                       [--force] [--teiCoordinates]
                       service

Client for GROBID services

positional arguments:
  service               one of [processFulltextDocument,
                        processHeaderDocument, processReferences]

optional arguments:
  -h, --help            show this help message and exit
  --input INPUT         path to the directory containing PDF to process
  --output OUTPUT       path to the directory where to put the results
                        (optional)
  --config CONFIG       path to the config file, default is ./config.json
  --n N                 concurrency for service usage
  --generateIDs         generate random xml:id to textual XML elements of the
                        result files
  --consolidate_header  call GROBID with consolidation of the metadata
                        extracted from the header
  --consolidate_citations
                        call GROBID with consolidation of the extracted
                        bibliographical references
  --force               force re-processing pdf input files when tei output
                        files already exist
  --teiCoordinates      add the original PDF coordinates (bounding boxes) to
                        the extracted elements
```

```bash
## typical usage
python extract_text.py --input example-dir-with-pdfs/ --output example-dir-to-dump/ --consolidate_header --consolidate_citations processFulltextDocument

```
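
For scripted pipelines, the same extraction can also be driven against the GROBID REST service directly instead of through the client script. The snippet below is only a minimal sketch: it assumes a GROBID server is already running on `localhost:8070` (the default port from the installation guide) and uses the `requests` library, which is not a dependency of this repository.

```python
import pathlib

import requests  # assumption: install separately, not a project dependency

# assumption: a local GROBID server on the default port
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"


def pdf_to_tei(pdf_path, out_dir):
    """Send one PDF to GROBID and write the returned TEI XML into out_dir."""
    pdf = pathlib.Path(pdf_path)
    with pdf.open("rb") as handle:
        response = requests.post(GROBID_URL, files={"input": handle}, timeout=120)
    response.raise_for_status()
    out_file = pathlib.Path(out_dir) / f"{pdf.stem}.tei.xml"
    out_file.write_text(response.text, encoding="utf-8")
    return out_file


# e.g. pdf_to_tei("example-dir-with-pdfs/paper.pdf", "example-dir-to-dump/")
```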

## Cleaning:

```bash

$ python clean_text.py -h

usage: clean_text.py [-h] --infile INFILE --outdir OUTDIR

Parser for XMLized scholarly publications.

optional arguments:
  -h, --help       show this help message and exit
  --infile INFILE  Path to the XML file to process.
  --outdir OUTDIR  Path to output directory for processed files.

```

```bash
## typical usage
python clean_text.py --infile dir-to/something.tei.xml --outdir dir-out/

```
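
`clean_text.py` writes one JSON file per input TEI document, named after the input file and placed in `--outdir`. A quick way to inspect the result is sketched below; the field names come from the `data` dictionary in `clean_text.py`, while the file path is only illustrative.

```python
import json

# illustrative path: clean_text.py names its output <infile name>.json inside --outdir
with open("dir-out/something.tei.xml.json") as handle:
    record = json.load(handle)

print(sorted(record))            # abstract, authors, clean_content, formulas, raw_content, references
print(record["abstract"][:200])  # first part of the extracted abstract
print(record["authors"])         # de-duplicated "First Last" strings
```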

145 changes: 145 additions & 0 deletions src/data/clean_text.py
@@ -0,0 +1,145 @@
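"""Parse a GROBID TEI XML file into a flat JSON record (abstract, authors, body text, references, formulas)."""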
import argparse
import bs4
import json
import os


def get_abstract(soup):

    # guard against documents where GROBID produced no <abstract> element
    abstract_tag = soup.find('abstract')
    abstract_element = abstract_tag.find('p') if abstract_tag is not None else None
    abstract_text = abstract_element.text if abstract_element is not None else '**NONE**'

    # some cleanup.. this is not smart, rather very naive
    if 'Abstract' in abstract_text:
        abstract_text = abstract_text.replace('Abstract', '').strip()
    if abstract_text.startswith('.'):
        abstract_text = abstract_text[1:].strip()

    return abstract_text


def get_authors(soup):

    author_elements = soup.find_all('author')

    authors = []
    for author in author_elements:
        firstname = author.find('forename')
        lastname = author.find('surname')

        first = firstname.text if firstname else '**NONE**'
        last = lastname.text if lastname else '**NONE**'
        out = f'{first} {last}'

        # some cleanup.. this is not smart, rather very naive
        if out.startswith('&'):
            out = out.replace('&', '').strip()

        authors.append(out)

    authors_list = list(set(authors))

    return authors_list


def get_content(soup):

    paragraph_elements = soup.find_all('p')

    paragraphs_list = []
    for para in paragraph_elements:
        paragraphs_list.append(para.text)

    return paragraphs_list


def get_references(soup):

    reference_elements = soup.find_all('ref')

    references_list = []
    for ref in reference_elements:
        references_list.append(ref.text)

    return references_list


def get_formulas(soup):

    formula_elements = soup.find_all('formula')

    formulas_list = []
    for formula in formula_elements:
        formulas_list.append(formula.text)

    return formulas_list


def run(args_dict):

    with open(args_dict['infile'], 'r') as fin:
        content = fin.read()

    soup = bs4.BeautifulSoup(content, 'xml')

    tmp_content = get_content(soup)
    content = ' '.join(tmp_content)
    raw_content = '////'.join(tmp_content)

    abstract = get_abstract(soup)
    authors = get_authors(soup)
    references = get_references(soup)
    formulas = get_formulas(soup)

    # need to strip all the fluff from the
    # cleaned content
    for ref in references:
        content = content.replace(ref, '')

    for frm in formulas:
        content = content.replace(frm, '')

    for atr in authors:
        content = content.replace(atr, '')

    content = content.replace(abstract, '')

    data = {
        'abstract': abstract,
        'authors': authors,
        'clean_content': content,
        'raw_content': raw_content,
        'references': references,
        'formulas': formulas
    }

    outdir = os.path.abspath(args_dict['outdir'])

    fname = f"{os.path.basename(args_dict['infile'])}.json"
    fout = os.path.join(outdir, fname)

    with open(fout, 'w') as fobj:
        json.dump(data, fobj)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Parser for XMLized scholarly publications."
    )
    parser.add_argument(
        "--infile",
        required=True,
        help="Path to the XML file to process."
    )
    parser.add_argument(
        "--outdir",
        required=True,
        help="Path to output directory for processed files."
    )

    args_dict = vars(parser.parse_args())

    run(args_dict)