generated from aicoe-aiops/project-template
-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #27 from Shreyanand/demo2
Add text extraction notebook and other demo2 files
- Loading branch information
Showing
24 changed files
with
3,246 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+331 KB
data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Text Extraction\n", | ||
"As a first step of the pipeline, we aim to extract text from PDFs in this notebook. The input PDFs for this notebook are in the `ROOT/data/pdfs` directory and the output JSON will be stored in the `ROOT/data/extraction` directory. The output from this notebook combined with the annotations will be used in the next step of curation." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Author: ALLIANZ NLP esg data pipeline\n", | ||
"from src.components.preprocessing import Extractor\n", | ||
"import src.components.config as config" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"PDF_FOLDER = config.BASE_PDF_FOLDER\n", | ||
"EXT_FOLDER = config.BASE_EXTRACTION_FOLDER" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Call text extractor" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"PDFTextExtractor_kwargs = {\n", | ||
" \"min_paragraph_length\": 30,\n", | ||
" \"annotation_folder\": None,\n", | ||
" \"skip_extracted_files\": False,\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['/home/shanand/aicoe-osc-demo/data/pdfs/ESG/sustainability-report-2019.pdf']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"ext = Extractor([(\"PDFTextExtractor\", PDFTextExtractor_kwargs)])\n", | ||
"ext.run_folder(PDF_FOLDER, EXT_FOLDER)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Conclusion\n", | ||
"We called the Extractor class to extract text from the PDF and store the output in the `ROOT/data/extraction` folder." | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +0,0 @@ | ||
"""Analytics compiled into python code.""" | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import os | ||
import src | ||
import pathlib | ||
import torch | ||
|
||
# General config
STAGE = "extract"  # "extract" | "curate"
SEED = 42

# Repository root: the parent of the directory that holds the ``src``
# package's module file.
ROOT = pathlib.Path(src.__file__).resolve().parent.parent
CONFIG_FOLDER = ROOT / "config"
CHECKPOINT_FOLDER = ROOT / "checkpoint"
# The data used by the demo notebooks lives in the "data" directory under ROOT.
DATA_FOLDER = ROOT / "data"
BASE_PDF_FOLDER = DATA_FOLDER / "pdfs"
BASE_ANNOTATION_FOLDER = DATA_FOLDER / "annotations"
BASE_EXTRACTION_FOLDER = DATA_FOLDER / "extraction"
BASE_CURATION_FOLDER = DATA_FOLDER / "curation"

# Ensure the output folders exist when this module is imported.
# ``exist_ok=True`` avoids the check-then-create race of
# ``os.path.exists`` + ``os.mkdir``, and ``makedirs`` also creates a missing
# parent (DATA_FOLDER) instead of failing with FileNotFoundError.
os.makedirs(BASE_EXTRACTION_FOLDER, exist_ok=True)
os.makedirs(BASE_CURATION_FOLDER, exist_ok=True)
|
||
# Choose the table-detection checkpoint and model config according to the
# installed torch build: version strings containing "cpu" select the
# "_v2" checkpoint and the "_coco" config (presumably the CPU-only variants).
if "cpu" in torch.__version__:
    ckpt = "icdar_19b2_v2.pth"
    config_file = "cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py"
else:
    ckpt = "icdar_19b2.pth"
    config_file = "cascade_mask_rcnn_hrnetv2p_w32_20e.py"

# Keyword arguments passed to PDFTableExtractor
PDFTableExtractor_kwargs = {
    "batch_size": -1,
    "cscdtabnet_config": CONFIG_FOLDER / config_file,
    "cscdtabnet_ckpt": CHECKPOINT_FOLDER / ckpt,
    "bbox_thres": 0.85,
    "dpi": 200,
}

# Keyword arguments passed to PDFTextExtractor
PDFTextExtractor_kwargs = {
    "min_paragraph_length": 30,
    # Set to ANNOTATION_FOLDER to extract only the pdfs mentioned in the
    # annotations; set to None to extract every pdf in the pdf folder
    # (production stage).
    "annotation_folder": None,
    "skip_extracted_files": False,
}

# Keyword arguments passed to TableCurator
TableCurator_kwargs = {
    "neg_pos_ratio": 1,
    "create_neg_samples": True,
    "columns_to_read": [
        "company",
        "source_file",
        "source_page",
        "kpi_id",
        "year",
        "answer",
        "data_type",
    ],
    "company_to_exclude": ["CEZ"],
    "seed": SEED,
}

# Keyword arguments passed to TextCurator
TextCurator_kwargs = {
    "retrieve_paragraph": False,
    "neg_pos_ratio": 1,
    "columns_to_read": [
        "company",
        "source_file",
        "source_page",
        "kpi_id",
        "year",
        "answer",
        "data_type",
        "relevant_paragraphs",
    ],
    "company_to_exclude": [],
    "create_neg_samples": True,
    "seed": SEED,
}

# Components: each entry is a (component name, kwargs dict) pair.
EXTRACTORS = [
    # ("PDFTableExtractor", PDFTableExtractor_kwargs),
    ("PDFTextExtractor", PDFTextExtractor_kwargs)
]

CURATORS = [
    ("TextCurator", TextCurator_kwargs)
    # ,("TableCurator", TableCurator_kwargs)
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from .base_component import BaseComponent # noqa F401 | ||
from .pdf_table_extractor import PDFTableExtractor # noqa F401 | ||
from .pdf_text_extractor import PDFTextExtractor # noqa F401 | ||
from .extractor import Extractor # noqa F401 | ||
from .nq_extractor import NQExtractor # noqa F401 | ||
from .nq_curator import NQCurator # noqa F401 | ||
from .curator import Curator # noqa F401 | ||
from .text_curator import TextCurator # noqa F401 | ||
from .table_curator import TableCurator # noqa F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
|
||
class BaseComponent(ABC):
    """Abstract base class for pipeline components.

    Subclasses have to override ``run``; a class that leaves it abstract
    cannot be instantiated.

    Args:
        name (A str): Label stored on the instance as ``name``
    """

    def __init__(self, name="Base"):
        self.name = name

    @abstractmethod
    def run(self, *args, **kwargs):
        """Execute the component. Concrete subclasses define the behavior."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import re | ||
from abc import abstractmethod | ||
|
||
from .base_component import BaseComponent | ||
|
||
|
||
# Remember to also implement BaseComponent's abstract methods for child classes | ||
# of this class | ||
class BaseCurator(BaseComponent):
    """Abstract base class for curators.

    Child classes must implement the abstract methods declared here as well
    as the abstract methods inherited from BaseComponent.

    Args:
        name (A str): Label stored on the instance as ``name``
    """

    def __init__(self, name="BaseCurator"):
        # Delegate to BaseComponent so the attribute is set in one place.
        super().__init__(name)

    @abstractmethod
    def process_single_annotation_file(self, annotation_filepath, *args, **kwargs):
        """Process one annotation file. Implemented by child classes.

        Args:
            annotation_filepath: Path of the annotation file to process
        """

    @abstractmethod
    def create_pos_examples(self, row, *args, **kwargs):
        """Create positive examples for ``row``. Implemented by child classes."""

    @abstractmethod
    def create_negative_examples(self, row, *args, **kwargs):
        """Create negative examples for ``row``. Implemented by child classes."""

    @staticmethod
    def clean_text(text):
        """
        Clean text
        Args:
            text (A str)
        Returns:
            A str with curly quotes normalized or removed, newlines and tabs
            turned into spaces, control and \\x7f-\\xff characters dropped and
            runs of whitespace collapsed to a single space.
        """
        # Raw strings are used for every pattern so that "\[" and "\]" are
        # regex escapes rather than invalid Python string escapes.
        # An opening curly quote directly after "[" becomes a straight quote
        text = re.sub(r"(?<=\[)“", '"', text)
        # A closing curly quote directly before "]" becomes a straight quote
        text = re.sub(r"”(?=\])", '"', text)
        # The remaining curly quotes are removed (replaced by an empty string)
        text = re.sub(r"“|”", "", text)
        # Newlines and tabs become single spaces
        text = re.sub(r"\n|\t", " ", text)
        # Drop control characters and everything in the \x7f-\xff range
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]", "", text)
        # Collapse runs of two or more whitespace characters
        text = re.sub(r"\s{2,}", " ", text)
        return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import glob | ||
import logging | ||
|
||
from .text_curator import TextCurator | ||
from .table_curator import TableCurator | ||
|
||
logger = logging.getLogger(__name__) | ||
NAME_CLASS_MAPPING = {"TextCurator": TextCurator, "TableCurator": TableCurator} | ||
|
||
|
||
class Curator:
    """A data curator component responsible for creating table and text
    training data based on annotated data.

    Args:
        curators (A list of tuples): (Name of curator, kwargs_dict). The name
            must be a key of NAME_CLASS_MAPPING; the kwargs are passed to the
            curator class constructor.
    """

    def __init__(self, curators):
        self.curators = self.__create_curators(curators)

    def __create_curators(self, curators):
        """
        Returns a list of curator objects
        Args:
            curators (A list of tuples): (Name of curator, kwargs_dict)
        Raises:
            ValueError: if a curator name is not in NAME_CLASS_MAPPING
        """
        list_cura = []
        for cura in curators:
            name, kwargs = cura[0], cura[1]
            # Only the name lookup is guarded, so a KeyError raised inside a
            # curator's constructor is not misreported as an invalid name.
            try:
                curator_cls = NAME_CLASS_MAPPING[name]
            except KeyError as err:
                raise ValueError("{} is an invalid curator".format(name)) from err

            list_cura.append(curator_cls(**kwargs))

        return list_cura

    @staticmethod
    def _find_annotation_excels(annotation_folder):
        """Returns the paths of the .xlsx files directly inside a folder.

        Files whose name starts with "~" or "$" are skipped (presumably the
        "~$..." lock files Excel creates while a workbook is open).
        Args:
            annotation_folder (A str or PosixPath)
        """
        # The suffix must be the literal ".xlsx"; writing it as "[.xlsx]"
        # would be a one-character class matching any name that ends in
        # ".", "x", "l" or "s" (e.g. ".xls" or ".docx" files).
        return glob.glob("{}/[!~$]*.xlsx".format(annotation_folder))

    def run(self, input_extraction_folder, annotation_folder, output_folder):
        """Runs curation for each curator.
        Args:
            input_extraction_folder (A str or PosixPath)
            annotation_folder (A str or PosixPath)
            output_folder (A str or PosixPath)
        """
        annotation_excels = self._find_annotation_excels(annotation_folder)
        logger.info("Received {} excel files".format(len(annotation_excels)))

        # Every curator receives the same list of annotation files.
        for curator_obj in self.curators:
            curator_obj.run(input_extraction_folder, annotation_excels, output_folder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from .pdf_table_extractor import PDFTableExtractor | ||
from .pdf_text_extractor import PDFTextExtractor | ||
import logging | ||
|
||
_logger = logging.getLogger(__name__) | ||
NAME_CLASS_MAPPING = { | ||
"PDFTextExtractor": PDFTextExtractor, | ||
"PDFTableExtractor": PDFTableExtractor, | ||
} | ||
|
||
|
||
class Extractor:
    """A pipeline extractor which combines different types of extractors."""

    def __init__(self, extractors):
        """
        A pipeline extractor which combines different types of extractors
        Args:
            extractors (A list of tuples): (Name of extractor, kwargs_dict)
        """
        self.extractors = self.__create_extractors(extractors)

    def __create_extractors(self, extractors):
        """Returns a list of extractors objects
        Args:
            extractors (A list of tuples): (Name of extractor, kwargs_dict)
        Raises:
            ValueError: if an extractor name is not in NAME_CLASS_MAPPING
        """
        list_ext = []
        for ext in extractors:
            name, kwargs = ext[0], ext[1]
            # Only the name lookup is guarded, so a KeyError raised inside an
            # extractor's constructor is not misreported as an invalid name.
            try:
                extractor_cls = NAME_CLASS_MAPPING[name]
            except KeyError as err:
                raise ValueError("{} is an invalid extractor".format(name)) from err

            list_ext.append(extractor_cls(**kwargs))

        return list_ext

    def run(self, input_filepath, output_folder):
        """
        Extract a single file
        Args:
            input_filepath (str): Input file path
            output_folder (str): Output folder path
        """
        _logger.info("Running all extractors...")

        # Return values of the individual extractors are intentionally ignored.
        for ext in self.extractors:
            ext.run(input_filepath, output_folder)

    def run_folder(self, input_folder, output_folder):
        """
        Extract for all files mentioned in folder.
        (The logic is based on each child.)
        Args:
            input_folder (A str): Input folder path
            output_folder (A str): Output folder path
        """
        for ext in self.extractors:
            ext.run_folder(input_folder, output_folder)
Oops, something went wrong.