Skip to content

Commit

Permalink
Merge pull request #27 from Shreyanand/demo2
Browse files Browse the repository at this point in the history
Add text extraction notebook and other demo2 files
  • Loading branch information
MichaelClifford authored Oct 11, 2021
2 parents 70c9135 + d61522c commit fbb75d6
Show file tree
Hide file tree
Showing 24 changed files with 3,246 additions and 106 deletions.
9 changes: 9 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ trino = "*"
pandas = "*"
pyarrow = "*"
python-dotenv = "*"
pdf2image = "*"
mmdet = "*"
mmcv-full = "*"
torch = "*"
torchvision = "*"
tabula-py = "*"
gdown = "*"
pdfminer.six = "*"
fuzzywuzzy = "*"
python-Levenshtein = "*"

[requires]
Expand Down
643 changes: 538 additions & 105 deletions Pipfile.lock

Large diffs are not rendered by default.

Binary file not shown.
665 changes: 665 additions & 0 deletions data/extraction/esg_TEXT_dataset.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/extraction/sustainability-report-2019.json

Large diffs are not rendered by default.

Binary file added data/pdfs/ESG/sustainability-report-2019.pdf
Binary file not shown.
100 changes: 100 additions & 0 deletions notebooks/demo2/pdf_text_extraction.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text Extraction\n",
"As a first step of the pipeline, we aim to extract text from PDFs in this notebook. The input PDFs for this notebook are in the `ROOT/data/pdfs` directory and the output JSON will be stored in the `ROOT/data/extraction` directory. The output from this notebook combined with the annotations will be used in the next step of curation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Author: ALLIANZ NLP esg data pipeline\n",
"from src.components.preprocessing import Extractor\n",
"import src.components.config as config"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"PDF_FOLDER = config.BASE_PDF_FOLDER\n",
"EXT_FOLDER = config.BASE_EXTRACTION_FOLDER"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Call text extractor"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"PDFTextExtractor_kwargs = {\n",
" \"min_paragraph_length\": 30,\n",
" \"annotation_folder\": None,\n",
" \"skip_extracted_files\": False,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['/home/shanand/aicoe-osc-demo/data/pdfs/ESG/sustainability-report-2019.pdf']\n"
]
}
],
"source": [
"ext = Extractor([(\"PDFTextExtractor\", PDFTextExtractor_kwargs)])\n",
"ext.run_folder(PDF_FOLDER, EXT_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Conclusion\n",
"We called the Extractor class to extract text from the PDF and store the output in the `ROOT/data/extraction` folder."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
1 change: 0 additions & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
"""Analytics compiled into python code."""
92 changes: 92 additions & 0 deletions src/components/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import src
import pathlib
import torch

# General config
STAGE = "extract"  # one of: "extract" | "curate"
SEED = 42

# Project root is the parent of the installed ``src`` package directory.
ROOT = pathlib.Path(src.__file__).resolve().parent.parent
CONFIG_FOLDER = ROOT / "config"
CHECKPOINT_FOLDER = ROOT / "checkpoint"
# the data for demo notebooks is located at sample_data directory
DATA_FOLDER = ROOT / "data"
BASE_PDF_FOLDER = DATA_FOLDER / "pdfs"
BASE_ANNOTATION_FOLDER = DATA_FOLDER / "annotations"
BASE_EXTRACTION_FOLDER = DATA_FOLDER / "extraction"
BASE_CURATION_FOLDER = DATA_FOLDER / "curation"


# makedirs(exist_ok=True) is race-free and also creates any missing parent
# directories (the old exists-check + os.mkdir pair failed with
# FileNotFoundError when DATA_FOLDER itself did not exist yet).
os.makedirs(BASE_EXTRACTION_FOLDER, exist_ok=True)
os.makedirs(BASE_CURATION_FOLDER, exist_ok=True)

# CPU-only torch builds carry a "+cpu" tag in __version__; pick the matching
# cascadetabnet checkpoint / mmdet config file for the detected build.
ckpt = "icdar_19b2_v2.pth" if "cpu" in torch.__version__ else "icdar_19b2.pth"
config_file = (
    "cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py"
    if "cpu" in torch.__version__
    else "cascade_mask_rcnn_hrnetv2p_w32_20e.py"
)

# PDFTableExtractor: table detection model settings.
PDFTableExtractor_kwargs = {
    "batch_size": -1,
    "cscdtabnet_config": CONFIG_FOLDER / config_file,
    "cscdtabnet_ckpt": CHECKPOINT_FOLDER / ckpt,
    "bbox_thres": 0.85,  # minimum detection confidence for a table bbox
    "dpi": 200,
}

# PDFTextExtractor
PDFTextExtractor_kwargs = {
    "min_paragraph_length": 30,
    # Set to ANNOTATION_FOLDER if you want to extract just pdfs mentioned in the annotations
    # Set to None to extract all pdfs in pdf folder (for production stage)
    "annotation_folder": None,
    "skip_extracted_files": False,
}

TableCurator_kwargs = {
    "neg_pos_ratio": 1,
    "create_neg_samples": True,
    "columns_to_read": [
        "company",
        "source_file",
        "source_page",
        "kpi_id",
        "year",
        "answer",
        "data_type",
    ],
    "company_to_exclude": ["CEZ"],
    "seed": SEED,
}

TextCurator_kwargs = {
    "retrieve_paragraph": False,
    "neg_pos_ratio": 1,
    "columns_to_read": [
        "company",
        "source_file",
        "source_page",
        "kpi_id",
        "year",
        "answer",
        "data_type",
        "relevant_paragraphs",
    ],
    "company_to_exclude": [],
    "create_neg_samples": True,
    "seed": SEED,
}

# Components
EXTRACTORS = [
    # ("PDFTableExtractor", PDFTableExtractor_kwargs),
    ("PDFTextExtractor", PDFTextExtractor_kwargs)
]

CURATORS = [
    ("TextCurator", TextCurator_kwargs)
    # ,("TableCurator", TableCurator_kwargs)
]
9 changes: 9 additions & 0 deletions src/components/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .base_component import BaseComponent # noqa F401
from .pdf_table_extractor import PDFTableExtractor # noqa F401
from .pdf_text_extractor import PDFTextExtractor # noqa F401
from .extractor import Extractor # noqa F401
from .nq_extractor import NQExtractor # noqa F401
from .nq_curator import NQCurator # noqa F401
from .curator import Curator # noqa F401
from .text_curator import TextCurator # noqa F401
from .table_curator import TableCurator # noqa F401
10 changes: 10 additions & 0 deletions src/components/preprocessing/base_component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from abc import ABC, abstractmethod


class BaseComponent(ABC):
    """Abstract base class for all preprocessing pipeline components.

    Every component carries a human-readable ``name`` (useful for logging)
    and must implement :meth:`run`.
    """

    def __init__(self, name="Base"):
        """
        Args:
            name (str): Identifier for this component instance.
        """
        self.name = name

    @abstractmethod
    def run(self, *args, **kwargs):
        """Execute the component; concrete subclasses must override this."""
        pass
42 changes: 42 additions & 0 deletions src/components/preprocessing/base_curator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import re
from abc import abstractmethod

from .base_component import BaseComponent


# Remember to also implement BaseComponent's abstract methods for child classes
# of this class
class BaseCurator(BaseComponent):
    """Common interface and text-cleaning utilities for curator components."""

    def __init__(self, name="BaseCurator"):
        # Delegate to BaseComponent so name handling stays in one place.
        super().__init__(name=name)

    @abstractmethod
    def process_single_annotation_file(self, annotation_filepath, *args, **kwargs):
        """Process one annotation file and produce curated examples."""
        pass

    @abstractmethod
    def create_pos_examples(self, row, *args, **kwargs):
        """Build positive training examples from an annotation row."""
        pass

    @abstractmethod
    def create_negative_examples(self, row, *args, **kwargs):
        """Build negative training examples from an annotation row."""
        pass

    @staticmethod
    def clean_text(text):
        """Normalize extracted text for downstream matching.

        Args:
            text (str): Raw text to clean.

        Returns:
            str: Cleaned text with curly quotes handled, control and
            non-ASCII bytes removed, and whitespace collapsed.
        """
        # Substitute unusual quotes at the start of the string with usual
        # quotes (raw strings make the regex escapes explicit; no noqa needed)
        text = re.sub(r"(?<=\[)“", '"', text)
        # Substitute unusual quotes at the end of the string with usual quotes
        text = re.sub(r"”(?=\])", '"', text)
        # Drop any remaining unusual quotes entirely
        text = re.sub("“|”", "", text)
        # Newlines/tabs become spaces before whitespace collapsing below
        text = re.sub("\n|\t", " ", text)
        # Remove control characters and non-ASCII bytes
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]", "", text)
        # Collapse runs of whitespace to a single space
        text = re.sub(r"\s{2,}", " ", text)
        return text
50 changes: 50 additions & 0 deletions src/components/preprocessing/curator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import glob
import logging
import os

from .text_curator import TextCurator
from .table_curator import TableCurator

logger = logging.getLogger(__name__)
NAME_CLASS_MAPPING = {"TextCurator": TextCurator, "TableCurator": TableCurator}


class Curator:
    """A data curator component responsible for creating table and text
    training data based on annotated data.

    Args:
        curators (A list of tuples): (Name of curator, kwargs_dict)
    """

    def __init__(self, curators):
        self.curators = self.__create_curators(curators)

    def __create_curators(self, curators):
        """
        Returns a list of curator objects
        Args:
            curators (A list of tuples): (Name of curator, kwargs_dict)
        """
        list_cura = []
        for cura in curators:
            # Guard only the name lookup: a KeyError raised inside a
            # curator's constructor must propagate, not be misreported
            # as an unknown curator name.
            try:
                cura_cls = NAME_CLASS_MAPPING[cura[0]]
            except KeyError:
                raise ValueError("{} is an invalid curator".format(cura[0]))

            list_cura.append(cura_cls(**cura[1]))

        return list_cura

    def run(self, input_extraction_folder, annotation_folder, output_folder):
        """Runs curation for each curator.
        Args:
            input_extraction_folder (A str or PosixPath)
            annotation_folder (A str or PosixPath)
            output_folder (A str or PosixPath)
        """
        # The previous pattern "[!~$]*[.xlsx]" misused glob character
        # classes: "[.xlsx]" matches a SINGLE character, so any file ending
        # in ".", "x", "l" or "s" was picked up. Match real .xlsx files and
        # skip Excel lock/temp files ("~$...") explicitly instead.
        annotation_excels = [
            f
            for f in glob.glob("{}/*.xlsx".format(annotation_folder))
            if not os.path.basename(f).startswith("~$")
        ]
        logger.info("Received {} excel files".format(len(annotation_excels)))

        for curator_obj in self.curators:
            curator_obj.run(input_extraction_folder, annotation_excels, output_folder)
63 changes: 63 additions & 0 deletions src/components/preprocessing/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from .pdf_table_extractor import PDFTableExtractor
from .pdf_text_extractor import PDFTextExtractor
import logging

_logger = logging.getLogger(__name__)
NAME_CLASS_MAPPING = {
"PDFTextExtractor": PDFTextExtractor,
"PDFTableExtractor": PDFTableExtractor,
}


class Extractor:
    def __init__(self, extractors):
        """
        A pipeline extractor which combines different types of extractors
        Args:
            extractors (A list of tuples): (Name of extractor, kwargs_dict)
        """
        self.extractors = self.__create_extractors(extractors)

    def __create_extractors(self, extractors):
        """Returns a list of extractor objects
        Args:
            extractors (A list of tuples): (Name of extractor, kwargs_dict)
        """
        list_ext = []
        for ext in extractors:
            # Guard only the name lookup: a KeyError raised inside an
            # extractor's constructor must propagate, not be misreported
            # as an unknown extractor name.
            try:
                ext_cls = NAME_CLASS_MAPPING[ext[0]]
            except KeyError:
                raise ValueError("{} is an invalid extractor".format(ext[0]))

            list_ext.append(ext_cls(**ext[1]))

        return list_ext

    def run(self, input_filepath, output_folder):
        """
        Extract a single file
        Args:
            input_filepath (str): Input file path
            output_folder (str): Output folder path
        """
        _logger.info("Running all extractors...")

        for ext in self.extractors:
            # Return value intentionally discarded; extractors write their
            # output to output_folder themselves.
            _ = ext.run(input_filepath, output_folder)

    def run_folder(self, input_folder, output_folder):
        """
        Extract for all files mentioned in folder.
        (The logic is based on each child.)
        Args:
            input_folder (A str): Input folder path
            output_folder (A str): Output folder path
        """
        for ext in self.extractors:
            ext.run_folder(input_folder, output_folder)
Loading

0 comments on commit fbb75d6

Please sign in to comment.