added GROBID extraction for publications and bs4 parser for xmlized data
mitchbregs committed Sep 25, 2019
1 parent c5ad85b commit 9176625
Showing 5 changed files with 546 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .gitignore
@@ -30,3 +30,17 @@ __pycache__/

# system files
.DS_Store

# logs
*.log

# flat-files
*.json
*.xml
*.csv


# envs
venv/
env/
virtualenv/
83 changes: 83 additions & 0 deletions src/data/README.md
@@ -44,3 +44,86 @@ For more information please contact Principal Investigator, Benjamin Ryan (ben_r
---
## Acknowledgements
This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008.

------


## Extraction Tool - GROBID:


### Installation:

https://grobid.readthedocs.io/en/latest/Install-Grobid/


#### GROBID Client:

Credit to: https://github.com/kermitt2/grobid-client-python


### Usage:

```bash
$ python extract_text.py -h

usage: extract_text.py [-h] [--input INPUT] [--output OUTPUT]
                       [--config CONFIG] [--n N] [--generateIDs]
                       [--consolidate_header] [--consolidate_citations]
                       [--force] [--teiCoordinates]
                       service

Client for GROBID services

positional arguments:
  service               one of [processFulltextDocument,
                        processHeaderDocument, processReferences]

optional arguments:
  -h, --help            show this help message and exit
  --input INPUT         path to the directory containing PDF to process
  --output OUTPUT       path to the directory where to put the results
                        (optional)
  --config CONFIG       path to the config file, default is ./config.json
  --n N                 concurrency for service usage
  --generateIDs         generate random xml:id to textual XML elements of the
                        result files
  --consolidate_header  call GROBID with consolidation of the metadata
                        extracted from the header
  --consolidate_citations
                        call GROBID with consolidation of the extracted
                        bibliographical references
  --force               force re-processing pdf input files when tei output
                        files already exist
  --teiCoordinates      add the original PDF coordinates (bounding boxes) to
                        the extracted elements
```

```bash
## typical usage
python extract_text.py --input example-dir-with-pdfs/ --output example-dir-to-dump/ --consolidate_header --consolidate_citations processFulltextDocument

```
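
For scripted pipelines, the same extraction can also be driven against the GROBID REST service directly instead of through the client script. The snippet below is only a minimal sketch: it assumes a GROBID server is already running on `localhost:8070` (the default port from the installation guide) and uses the `requests` library, which is not a dependency of this repository.

```python
import pathlib

import requests  # assumption: install separately, not a project dependency

# assumption: a local GROBID server on the default port
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"


def pdf_to_tei(pdf_path, out_dir):
    """Send one PDF to GROBID and write the returned TEI XML into out_dir."""
    pdf = pathlib.Path(pdf_path)
    with pdf.open("rb") as handle:
        response = requests.post(GROBID_URL, files={"input": handle}, timeout=120)
    response.raise_for_status()
    out_file = pathlib.Path(out_dir) / f"{pdf.stem}.tei.xml"
    out_file.write_text(response.text, encoding="utf-8")
    return out_file


# e.g. pdf_to_tei("example-dir-with-pdfs/paper.pdf", "example-dir-to-dump/")
```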

## Cleaning:

```bash

$ python clean_text.py -h

usage: clean_text.py [-h] --infile INFILE --outdir OUTDIR

Parser for XMLized scholarly publications.

optional arguments:
  -h, --help       show this help message and exit
  --infile INFILE  Path to the XML file to process.
  --outdir OUTDIR  Path to output directory for processed files.

```

```bash
## typical usage
python clean_text.py --infile dir-to/something.tei.xml --outdir dir-out/

```
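
`clean_text.py` writes one JSON file per input TEI document, named after the input file and placed in `--outdir`. A quick way to inspect the result is sketched below; the field names come from the `data` dictionary in `clean_text.py`, while the file path is only illustrative.

```python
import json

# illustrative path: clean_text.py names its output <infile name>.json inside --outdir
with open("dir-out/something.tei.xml.json") as handle:
    record = json.load(handle)

print(sorted(record))            # abstract, authors, clean_content, formulas, raw_content, references
print(record["abstract"][:200])  # first part of the extracted abstract
print(record["authors"])         # de-duplicated "First Last" strings
```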

145 changes: 145 additions & 0 deletions src/data/clean_text.py
@@ -0,0 +1,145 @@
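"""Parse a GROBID TEI XML file into a flat JSON record (abstract, authors, body text, references, formulas)."""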
import argparse
import bs4
import json
import os


def get_abstract(soup):

    # guard against documents where GROBID produced no <abstract> element
    abstract_tag = soup.find('abstract')
    abstract_element = abstract_tag.find('p') if abstract_tag is not None else None
    abstract_text = abstract_element.text if abstract_element is not None else '**NONE**'

    # some cleanup.. this is not smart, rather very naive
    if 'Abstract' in abstract_text:
        abstract_text = abstract_text.replace('Abstract', '').strip()
    if abstract_text.startswith('.'):
        abstract_text = abstract_text[1:].strip()

    return abstract_text


def get_authors(soup):

    author_elements = soup.find_all('author')

    authors = []
    for author in author_elements:
        firstname = author.find('forename')
        lastname = author.find('surname')

        first = firstname.text if firstname else '**NONE**'
        last = lastname.text if lastname else '**NONE**'
        out = f'{first} {last}'

        # some cleanup.. this is not smart, rather very naive
        if out.startswith('&'):
            out = out.replace('&', '').strip()

        authors.append(out)

    authors_list = list(set(authors))

    return authors_list


def get_content(soup):

    paragraph_elements = soup.find_all('p')

    paragraphs_list = []
    for para in paragraph_elements:
        paragraphs_list.append(para.text)

    return paragraphs_list


def get_references(soup):

    reference_elements = soup.find_all('ref')

    references_list = []
    for ref in reference_elements:
        references_list.append(ref.text)

    return references_list


def get_formulas(soup):

    formula_elements = soup.find_all('formula')

    formulas_list = []
    for formula in formula_elements:
        formulas_list.append(formula.text)

    return formulas_list


def run(args_dict):

    with open(args_dict['infile'], 'r') as fin:
        content = fin.read()

    soup = bs4.BeautifulSoup(content, 'xml')

    tmp_content = get_content(soup)
    content = ' '.join(tmp_content)
    raw_content = '////'.join(tmp_content)

    abstract = get_abstract(soup)
    authors = get_authors(soup)
    references = get_references(soup)
    formulas = get_formulas(soup)

    # need to strip all the fluff from the
    # cleaned content
    for ref in references:
        content = content.replace(ref, '')

    for frm in formulas:
        content = content.replace(frm, '')

    for atr in authors:
        content = content.replace(atr, '')

    content = content.replace(abstract, '')

    data = {
        'abstract': abstract,
        'authors': authors,
        'clean_content': content,
        'raw_content': raw_content,
        'references': references,
        'formulas': formulas
    }

    outdir = os.path.abspath(args_dict['outdir'])

    fname = f"{os.path.basename(args_dict['infile'])}.json"
    fout = os.path.join(outdir, fname)

    with open(fout, 'w') as fobj:
        json.dump(data, fobj)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Parser for XMLized scholarly publications."
    )
    parser.add_argument(
        "--infile",
        required=True,
        help="Path to the XML file to process."
    )
    parser.add_argument(
        "--outdir",
        required=True,
        help="Path to output directory for processed files."
    )

    args_dict = vars(parser.parse_args())

    run(args_dict)