Merge pull request #27 from Shreyanand/demo2

Add text extraction notebook and other demo2 files
os-climate · Oct 11, 2021 · fbb75d6 · fbb75d6
2 parents 70c9135 + d61522c
commit fbb75d6
Show file tree

Hide file tree

Showing 24 changed files with 3,246 additions and 106 deletions.
diff --git a/Pipfile b/Pipfile
@@ -17,6 +17,15 @@ trino = "*"
 pandas = "*"
 pyarrow = "*"
 python-dotenv = "*"
+pdf2image = "*"
+mmdet = "*"
+mmcv-full = "*"
+torch = "*"
+torchvision = "*"
+tabula-py = "*"
+gdown = "*"
+pdfminer.six = "*"
+fuzzywuzzy = "*"
 python-Levenshtein = "*"
 
 [requires]

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx b/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx
diff --git a/data/extraction/esg_TEXT_dataset.csv b/data/extraction/esg_TEXT_dataset.csv
diff --git a/data/extraction/sustainability-report-2019.json b/data/extraction/sustainability-report-2019.json
diff --git a/data/pdfs/ESG/sustainability-report-2019.pdf b/data/pdfs/ESG/sustainability-report-2019.pdf
diff --git a/notebooks/demo2/pdf_text_extraction.ipynb b/notebooks/demo2/pdf_text_extraction.ipynb
@@ -0,0 +1,100 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Text Extraction\n",
+    "As a first step of the pipeline, we aim to extract text from PDFs in this notebook. The input PDFs for this notebook are in the `ROOT/data/pdfs` directory and the output JSON will be stored in the `ROOT/data/extraction` directory. The output from this notebook combined with the annotations will be used in the next step of curation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Author: ALLIANZ NLP esg data pipeline\n",
+    "from src.components.preprocessing import Extractor\n",
+    "import src.components.config as config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PDF_FOLDER = config.BASE_PDF_FOLDER\n",
+    "EXT_FOLDER = config.BASE_EXTRACTION_FOLDER"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Call text extractor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PDFTextExtractor_kwargs = {\n",
+    "    \"min_paragraph_length\": 30,\n",
+    "    \"annotation_folder\": None,\n",
+    "    \"skip_extracted_files\": False,\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['/home/shanand/aicoe-osc-demo/data/pdfs/ESG/sustainability-report-2019.pdf']\n"
+     ]
+    }
+   ],
+   "source": [
+    "ext = Extractor([(\"PDFTextExtractor\", PDFTextExtractor_kwargs)])\n",
+    "ext.run_folder(PDF_FOLDER, EXT_FOLDER)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Conclusion\n",
+    "We called the Extractor class to extract text from the PDF and store the output in the `ROOT/data/extraction` folder."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/src/__init__.py b/src/__init__.py
@@ -1 +0,0 @@
-"""Analytics compiled into python code."""

diff --git a/src/components/config.py b/src/components/config.py
@@ -0,0 +1,92 @@
+import os
+import src
+import pathlib
+import torch
+
+# General config
+STAGE = "extract"  # "extract" | "curate"
+SEED = 42
+
+# Repository root: the directory that contains the `src` package.
+ROOT = pathlib.Path(src.__file__).resolve().parent.parent
+CONFIG_FOLDER = ROOT / "config"
+CHECKPOINT_FOLDER = ROOT / "checkpoint"
+# The data for the demo notebooks is located in the ROOT/data directory
+DATA_FOLDER = ROOT / "data"
+BASE_PDF_FOLDER = DATA_FOLDER / "pdfs"
+BASE_ANNOTATION_FOLDER = DATA_FOLDER / "annotations"
+BASE_EXTRACTION_FOLDER = DATA_FOLDER / "extraction"
+BASE_CURATION_FOLDER = DATA_FOLDER / "curation"
+
+
+# Create the output folders at import time. makedirs(exist_ok=True) also
+# creates a missing parent folder and does not fail when the folder already
+# exists (for instance when another process created it first).
+os.makedirs(BASE_EXTRACTION_FOLDER, exist_ok=True)
+os.makedirs(BASE_CURATION_FOLDER, exist_ok=True)
+
+# PDFTableExtractor
+# The checkpoint and model config file names are chosen by whether the string
+# "cpu" appears in torch.__version__.
+# NOTE(review): presumably this is meant to detect CPU-only torch builds —
+# confirm.
+ckpt = "icdar_19b2_v2.pth" if "cpu" in torch.__version__ else "icdar_19b2.pth"
+config_file = (
+    "cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py"
+    if "cpu" in torch.__version__
+    else "cascade_mask_rcnn_hrnetv2p_w32_20e.py"
+)
+PDFTableExtractor_kwargs = {
+    "batch_size": -1,
+    # Both paths are resolved relative to CONFIG_FOLDER / CHECKPOINT_FOLDER.
+    "cscdtabnet_config": CONFIG_FOLDER / config_file,
+    "cscdtabnet_ckpt": CHECKPOINT_FOLDER / ckpt,
+    "bbox_thres": 0.85,
+    "dpi": 200,
+}
+
+# PDFTextExtractor
+PDFTextExtractor_kwargs = {
+    "min_paragraph_length": 30,
+    # Set to BASE_ANNOTATION_FOLDER if you want to extract just the pdfs mentioned in the annotations
+    # Set to None to extract all pdfs in pdf folder (for production stage)
+    "annotation_folder": None,
+    "skip_extracted_files": False,
+}
+
+# TableCurator (currently only referenced by the commented-out CURATORS entry)
+TableCurator_kwargs = {
+    "neg_pos_ratio": 1,
+    "create_neg_samples": True,
+    "columns_to_read": [
+        "company",
+        "source_file",
+        "source_page",
+        "kpi_id",
+        "year",
+        "answer",
+        "data_type",
+    ],
+    "company_to_exclude": ["CEZ"],
+    "seed": SEED,
+}
+
+# TextCurator
+TextCurator_kwargs = {
+    "retrieve_paragraph": False,
+    "neg_pos_ratio": 1,
+    "columns_to_read": [
+        "company",
+        "source_file",
+        "source_page",
+        "kpi_id",
+        "year",
+        "answer",
+        "data_type",
+        "relevant_paragraphs",
+    ],
+    "company_to_exclude": [],
+    "create_neg_samples": True,
+    "seed": SEED,
+}
+
+# Components
+# Each entry is a (component name, kwargs dict) tuple.
+EXTRACTORS = [
+    # ("PDFTableExtractor", PDFTableExtractor_kwargs),
+    ("PDFTextExtractor", PDFTextExtractor_kwargs)
+]
+
+CURATORS = [
+    ("TextCurator", TextCurator_kwargs)
+    # ,("TableCurator", TableCurator_kwargs)
+]
diff --git a/src/components/preprocessing/__init__.py b/src/components/preprocessing/__init__.py
@@ -0,0 +1,9 @@
+from .base_component import BaseComponent  # noqa F401
+from .pdf_table_extractor import PDFTableExtractor  # noqa F401
+from .pdf_text_extractor import PDFTextExtractor  # noqa F401
+from .extractor import Extractor  # noqa F401
+from .nq_extractor import NQExtractor  # noqa F401
+from .nq_curator import NQCurator  # noqa F401
+from .curator import Curator  # noqa F401
+from .text_curator import TextCurator  # noqa F401
+from .table_curator import TableCurator  # noqa F401
diff --git a/src/components/preprocessing/base_component.py b/src/components/preprocessing/base_component.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+
+
+class BaseComponent(ABC):
+    """Abstract base class for pipeline components.
+
+    Args:
+        name (A str): Label stored on the instance as ``self.name``.
+    """
+
+    def __init__(self, name="Base"):
+        self.name = name
+
+    @abstractmethod
+    def run(self, *args, **kwargs):
+        """Execute the component; concrete subclasses must override this."""
diff --git a/src/components/preprocessing/base_curator.py b/src/components/preprocessing/base_curator.py
@@ -0,0 +1,42 @@
+import re
+from abc import abstractmethod
+
+from .base_component import BaseComponent
+
+
+# Remember to also implement BaseComponent's abstract methods for child classes
+# of this class
+class BaseCurator(BaseComponent):
+    """Abstract base class for curators.
+
+    Child classes must implement the abstract methods declared here as well
+    as the abstract methods of ``BaseComponent``.
+
+    Args:
+        name (A str): Name stored on the instance.
+    """
+
+    def __init__(self, name="BaseCurator"):
+        super().__init__(name)
+
+    @abstractmethod
+    def process_single_annotation_file(self, annotation_filepath, *args, **kwargs):
+        """Process one annotation file; must be implemented by child classes."""
+
+    @abstractmethod
+    def create_pos_examples(self, row, *args, **kwargs):
+        """Create positive examples for ``row``; must be implemented by child classes."""
+
+    @abstractmethod
+    def create_negative_examples(self, row, *args, **kwargs):
+        """Create negative examples for ``row``; must be implemented by child classes."""
+
+    @staticmethod
+    def clean_text(text):
+        """
+        Clean text
+
+        Args:
+            text (A str)
+
+        Returns:
+            A str with the substitutions below applied in order.
+        """
+        # Patterns are raw strings so that "\[" and "\]" reach the regex
+        # engine without relying on invalid string escape sequences.
+        # Substitute an unusual opening quote directly after "[" with a usual quote
+        text = re.sub(r"(?<=\[)“", '"', text)
+        # Substitute an unusual closing quote directly before "]" with a usual quote
+        text = re.sub(r"”(?=\])", '"', text)
+        # Remove the remaining unusual quotes
+        text = re.sub(r"“|”", "", text)
+        # Turn newlines and tabs into spaces
+        text = re.sub(r"\n|\t", " ", text)
+        # Drop control characters and every character from \x7f through \xff.
+        # NOTE(review): that range also covers Latin-1 accented letters —
+        # confirm this is intended.
+        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]", "", text)
+        # Collapse runs of two or more whitespace characters into one space
+        text = re.sub(r"\s{2,}", " ", text)
+        return text
diff --git a/src/components/preprocessing/curator.py b/src/components/preprocessing/curator.py
@@ -0,0 +1,50 @@
+import glob
+import logging
+
+from .text_curator import TextCurator
+from .table_curator import TableCurator
+
+logger = logging.getLogger(__name__)
+NAME_CLASS_MAPPING = {"TextCurator": TextCurator, "TableCurator": TableCurator}
+
+
+class Curator:
+    """A data curator component responsible for creating table and text training data based on annotated data
+
+    Args:
+        curators (A list of tuples): (Name of curator, kwargs_dict)
+    """
+
+    def __init__(self, curators):
+        self.curators = self.__create_curators(curators)
+
+    def __create_curators(self, curators):
+        """
+        Returns a list of curator objects
+
+        Args:
+            curators (A list of tuples): (Name of curator, kwargs_dict)
+
+        Raises:
+            ValueError: If a curator name is not a key of NAME_CLASS_MAPPING.
+        """
+        list_cura = []
+        for cura in curators:
+            # Only the name lookup is guarded, so a KeyError raised inside a
+            # curator's constructor is not misreported as an invalid name.
+            try:
+                cura_cls = NAME_CLASS_MAPPING[cura[0]]
+            except KeyError as err:
+                raise ValueError(
+                    "{} is an invalid curator".format(cura[0])
+                ) from err
+
+            list_cura.append(cura_cls(**cura[1]))
+
+        return list_cura
+
+    def run(self, input_extraction_folder, annotation_folder, output_folder):
+        """Runs curation for each curator.
+
+        Args:
+            input_extraction_folder (A str or PosixPath)
+            annotation_folder (A str or PosixPath)
+            output_folder (A str or PosixPath)
+        """
+        # "[!~$]" rejects names whose first character is "~" or "$" (such as
+        # the "~$..." temporary files Excel creates next to an open workbook);
+        # "*.xlsx" then requires the literal ".xlsx" extension. A bracketed
+        # "[.xlsx]" would only match one trailing character out of ". x l s".
+        annotation_excels = glob.glob("{}/[!~$]*.xlsx".format(annotation_folder))
+        logger.info("Received {} excel files".format(len(annotation_excels)))
+
+        for curator_obj in self.curators:
+            curator_obj.run(input_extraction_folder, annotation_excels, output_folder)
diff --git a/src/components/preprocessing/extractor.py b/src/components/preprocessing/extractor.py
@@ -0,0 +1,63 @@
+from .pdf_table_extractor import PDFTableExtractor
+from .pdf_text_extractor import PDFTextExtractor
+import logging
+
+_logger = logging.getLogger(__name__)
+NAME_CLASS_MAPPING = {
+    "PDFTextExtractor": PDFTextExtractor,
+    "PDFTableExtractor": PDFTableExtractor,
+}
+
+
+class Extractor:
+    def __init__(self, extractors):
+        """
+        A pipeline extractor which combines different types of extractors
+
+        Args:
+            extractors (A list of tuples): (Name of extractor, kwargs_dict)
+        """
+        self.extractors = self.__create_extractors(extractors)
+
+    def __create_extractors(self, extractors):
+        """Returns a list of extractors objects
+
+        Args:
+            extractors (A list of tuples): (Name of extractor, kwargs_dict)
+
+        Raises:
+            ValueError: If an extractor name is not a key of NAME_CLASS_MAPPING.
+        """
+        list_ext = []
+        for ext in extractors:
+            # Only the name lookup is guarded, so a KeyError raised inside an
+            # extractor's constructor is not misreported as an invalid name.
+            try:
+                ext_cls = NAME_CLASS_MAPPING[ext[0]]
+            except KeyError as err:
+                raise ValueError(
+                    "{} is an invalid extractor".format(ext[0])
+                ) from err
+
+            list_ext.append(ext_cls(**ext[1]))
+
+        return list_ext
+
+    def run(self, input_filepath, output_folder):
+        """
+        Extract a single file
+
+        Args:
+            input_filepath (str): Input file path
+            output_folder (str): Output folder path
+
+        """
+        _logger.info("Running all extractors...")
+
+        # The return value of each extractor's run() is not used here.
+        for ext in self.extractors:
+            ext.run(input_filepath, output_folder)
+
+    def run_folder(self, input_folder, output_folder):
+        """
+        Extract for all files mentioned in folder.
+        (The logic is based on each child.)
+
+        Args:
+            input_folder (A str): Input folder path
+            output_folder (A str): Output folder path
+        """
+        for ext in self.extractors:
+            ext.run_folder(input_folder, output_folder)