Merge pull request #63 from chauhankaranraj/update-nb-s3

Update demo2 notebooks to read/write data from/to s3
os-climate · Oct 25, 2021 · dd380c7 · dd380c7
2 parents f48055c + c33b183
commit dd380c7
Show file tree

Hide file tree

Showing 16 changed files with 416 additions and 6,582 deletions.
diff --git a/data/annotations/.gitkeep b/data/annotations/.gitkeep
diff --git a/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx b/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx
diff --git a/data/curation/.gitkeep b/data/curation/.gitkeep
diff --git a/data/curation/esg_TEXT_dataset.csv b/data/curation/esg_TEXT_dataset.csv
diff --git a/data/extraction/.gitkeep b/data/extraction/.gitkeep
diff --git a/data/extraction/sustainability-report-2019.json b/data/extraction/sustainability-report-2019.json
diff --git a/data/kpi_mapping/.gitkeep b/data/kpi_mapping/.gitkeep
diff --git a/data/kpi_mapping/ESG/kpi_mapping.csv b/data/kpi_mapping/ESG/kpi_mapping.csv
diff --git a/data/pdfs/.gitkeep b/data/pdfs/.gitkeep
diff --git a/data/pdfs/ESG/sustainability-report-2019.pdf b/data/pdfs/ESG/sustainability-report-2019.pdf
diff --git a/notebooks/demo2/config.py b/notebooks/demo2/config.py
@@ -16,6 +16,11 @@
 BASE_EXTRACTION_FOLDER = DATA_FOLDER / "extraction"
 BASE_CURATION_FOLDER = DATA_FOLDER / "curation"
 
+DATA_S3_PREFIX = "corpdata/ESG"
+BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs"
+BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations"
+BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction"
+BASE_CURATION_S3_PREFIX = f"{DATA_S3_PREFIX}/curation"
 
 ckpt = "icdar_19b2_v2.pth"
 config_file = "cascade_mask_rcnn_hrnetv2p_w32_20e_v2.py"

diff --git a/notebooks/demo2/pdf_table_curation.ipynb b/notebooks/demo2/pdf_table_curation.ipynb
@@ -20,11 +20,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n",
     "import glob\n",
+    "import pathlib\n",
     "import logging\n",
     "import pandas as pd\n",
+    "from dotenv import load_dotenv\n",
     "\n",
     "import config\n",
+    "from src.data.s3_communication import S3Communication\n",
     "from src.components.preprocessing import TableCurator\n",
     "\n",
     "logger = logging.getLogger()"
@@ -35,6 +39,49 @@
    "execution_count": 2,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# Load credentials\n",
+    "dotenv_dir = os.environ.get(\n",
+    "    \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n",
+    ")\n",
+    "dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n",
+    "if os.path.exists(dotenv_path):\n",
+    "    load_dotenv(dotenv_path=dotenv_path, override=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# init s3 connector\n",
+    "s3c = S3Communication(\n",
+    "    s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n",
+    "    aws_access_key_id=os.getenv(\"S3_ACCESS_KEY\"),\n",
+    "    aws_secret_access_key=os.getenv(\"S3_SECRET_KEY\"),\n",
+    "    s3_bucket=os.getenv(\"S3_BUCKET\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download annotation files\n",
+    "s3c.download_files_in_prefix_to_dir(\n",
+    "    config.BASE_ANNOTATION_S3_PREFIX,\n",
+    "    config.BASE_ANNOTATION_FOLDER,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# initialize table curator\n",
     "tb_cur = TableCurator(\n",
@@ -49,23 +96,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['/home/kachau/Documents/diffbranch/aicoe-osc-demo/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx']"
+       "['/home/kachau/Documents/aicoe-osc-demo/data/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx']"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# excel sheets containing manually labelled data\n",
-    "annotation_excels = glob.glob('{}/[!~$]*[.xlsx]'.format(config.BASE_ANNOTATION_FOLDER / \"ESG\"))\n",
+    "annotation_excels = glob.glob('{}/[!~$]*[.xlsx]'.format(config.BASE_ANNOTATION_FOLDER))\n",
     "annotation_excels"
    ]
   },
@@ -78,7 +125,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -102,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -585,7 +632,7 @@
        "39   239 million tonnes CO2e  sustainability-report-2019_page16_1.csv      1  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -596,6 +643,32 @@
     "df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "200"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# upload the curation file to s3\n",
+    "ret = s3c.upload_file_to_s3(\n",
+    "    config.BASE_CURATION_FOLDER / \"esg_TABLE_dataset.csv\",\n",
+    "    config.BASE_CURATION_S3_PREFIX,\n",
+    "    \"esg_TABLE_dataset.csv\",\n",
+    ")\n",
+    "ret['ResponseMetadata']['HTTPStatusCode']"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/notebooks/demo2/pdf_table_extraction.ipynb b/notebooks/demo2/pdf_table_extraction.ipynb
diff --git a/notebooks/demo2/pdf_text_curation.ipynb b/notebooks/demo2/pdf_text_curation.ipynb
@@ -15,9 +15,14 @@
    "outputs": [],
    "source": [
     "# Author: ALLIANZ NLP esg data pipeline\n",
-    "from src.components.preprocessing import Curator\n",
+    "import os\n",
+    "import pathlib\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "\n",
     "import config\n",
-    "import os"
+    "from src.components.preprocessing import Curator\n",
+    "from src.data.s3_communication import S3Communication"
    ]
   },
   {
@@ -26,9 +31,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "XLS = os.path.join(config.BASE_ANNOTATION_FOLDER, \"ESG\")\n",
-    "EXT_FOLDER = config.BASE_EXTRACTION_FOLDER\n",
-    "CUR_FOLDER = config.BASE_CURATION_FOLDER"
+    "# Load credentials\n",
+    "dotenv_dir = os.environ.get(\n",
+    "    \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n",
+    ")\n",
+    "dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n",
+    "if os.path.exists(dotenv_path):\n",
+    "    load_dotenv(dotenv_path=dotenv_path, override=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# init s3 connector\n",
+    "s3c = S3Communication(\n",
+    "    s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n",
+    "    aws_access_key_id=os.getenv(\"S3_ACCESS_KEY\"),\n",
+    "    aws_secret_access_key=os.getenv(\"S3_SECRET_KEY\"),\n",
+    "    s3_bucket=os.getenv(\"S3_BUCKET\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download the files created by the extraction phase\n",
+    "s3c.download_files_in_prefix_to_dir(\n",
+    "    config.BASE_EXTRACTION_S3_PREFIX,\n",
+    "    config.BASE_EXTRACTION_FOLDER,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download the annotation files\n",
+    "s3c.download_files_in_prefix_to_dir(\n",
+    "    config.BASE_ANNOTATION_S3_PREFIX,\n",
+    "    config.BASE_ANNOTATION_FOLDER,\n",
+    ")"
    ]
   },
   {
@@ -40,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -66,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -229,7 +279,33 @@
    ],
    "source": [
     "cur = Curator([(\"TextCurator\", TextCurator_kwargs)])\n",
-    "cur.run(EXT_FOLDER, XLS, CUR_FOLDER)"
+    "cur.run(config.BASE_EXTRACTION_FOLDER, config.BASE_ANNOTATION_FOLDER, config.BASE_CURATION_FOLDER)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "200"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# upload the curation file to s3\n",
+    "ret = s3c.upload_file_to_s3(\n",
+    "    config.BASE_CURATION_FOLDER / \"esg_TEXT_dataset.csv\",\n",
+    "    config.BASE_CURATION_S3_PREFIX,\n",
+    "    \"esg_TEXT_dataset.csv\",\n",
+    ")\n",
+    "ret['ResponseMetadata']['HTTPStatusCode']"
    ]
   },
   {
@@ -257,7 +333,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.8.11"
   }
  },
  "nbformat": 4,