Skip to content

Commit

Permalink
Merge pull request #63 from chauhankaranraj/update-nb-s3
Browse files Browse the repository at this point in the history
Update demo2 notebooks to read/write data from/to s3
  • Loading branch information
Shreyanand authored Oct 25, 2021
2 parents f48055c + c33b183 commit dd380c7
Show file tree
Hide file tree
Showing 16 changed files with 416 additions and 6,582 deletions.
Empty file added data/annotations/.gitkeep
Empty file.
Binary file not shown.
Empty file added data/curation/.gitkeep
Empty file.
6,415 changes: 0 additions & 6,415 deletions data/curation/esg_TEXT_dataset.csv

This file was deleted.

Empty file added data/extraction/.gitkeep
Empty file.
1 change: 0 additions & 1 deletion data/extraction/sustainability-report-2019.json

This file was deleted.

Empty file added data/kpi_mapping/.gitkeep
Empty file.
25 changes: 0 additions & 25 deletions data/kpi_mapping/ESG/kpi_mapping.csv

This file was deleted.

Empty file added data/pdfs/.gitkeep
Empty file.
Binary file removed data/pdfs/ESG/sustainability-report-2019.pdf
Binary file not shown.
5 changes: 5 additions & 0 deletions notebooks/demo2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
BASE_EXTRACTION_FOLDER = DATA_FOLDER / "extraction"
BASE_CURATION_FOLDER = DATA_FOLDER / "curation"

DATA_S3_PREFIX = "corpdata/ESG"
BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs"
BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations"
BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction"
BASE_CURATION_S3_PREFIX = f"{DATA_S3_PREFIX}/curation"

ckpt = "icdar_19b2_v2.pth"
config_file = "cascade_mask_rcnn_hrnetv2p_w32_20e_v2.py"
Expand Down
87 changes: 80 additions & 7 deletions notebooks/demo2/pdf_table_curation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import pathlib\n",
"import logging\n",
"import pandas as pd\n",
"from dotenv import load_dotenv\n",
"\n",
"import config\n",
"from src.data.s3_communication import S3Communication\n",
"from src.components.preprocessing import TableCurator\n",
"\n",
"logger = logging.getLogger()"
Expand All @@ -35,6 +39,49 @@
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load credentials\n",
"dotenv_dir = os.environ.get(\n",
" \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n",
")\n",
"dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n",
"if os.path.exists(dotenv_path):\n",
" load_dotenv(dotenv_path=dotenv_path, override=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# init s3 connector\n",
"s3c = S3Communication(\n",
" s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n",
" aws_access_key_id=os.getenv(\"S3_ACCESS_KEY\"),\n",
" aws_secret_access_key=os.getenv(\"S3_SECRET_KEY\"),\n",
" s3_bucket=os.getenv(\"S3_BUCKET\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# download annotation files\n",
"s3c.download_files_in_prefix_to_dir(\n",
" config.BASE_ANNOTATION_S3_PREFIX,\n",
" config.BASE_ANNOTATION_FOLDER,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# initialize table curator\n",
"tb_cur = TableCurator(\n",
Expand All @@ -49,23 +96,23 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/kachau/Documents/diffbranch/aicoe-osc-demo/data/annotations/ESG/20201030 1Qbit aggregated_annotations_needs_correction.xlsx']"
"['/home/kachau/Documents/aicoe-osc-demo/data/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx']"
]
},
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# excel sheets containing manually labelled data\n",
"annotation_excels = glob.glob('{}/[!~$]*[.xlsx]'.format(config.BASE_ANNOTATION_FOLDER / \"ESG\"))\n",
"annotation_excels = glob.glob('{}/[!~$]*[.xlsx]'.format(config.BASE_ANNOTATION_FOLDER))\n",
"annotation_excels"
]
},
Expand All @@ -78,7 +125,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -102,7 +149,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -585,7 +632,7 @@
"39 239 million tonnes CO2e sustainability-report-2019_page16_1.csv 1 "
]
},
"execution_count": 5,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -596,6 +643,32 @@
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# upload the curation file to s3\n",
"ret = s3c.upload_file_to_s3(\n",
" config.BASE_CURATION_FOLDER / \"esg_TABLE_dataset.csv\",\n",
" config.BASE_CURATION_S3_PREFIX,\n",
" \"esg_TABLE_dataset.csv\",\n",
")\n",
"ret['ResponseMetadata']['HTTPStatusCode']"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
266 changes: 158 additions & 108 deletions notebooks/demo2/pdf_table_extraction.ipynb

Large diffs are not rendered by default.

94 changes: 85 additions & 9 deletions notebooks/demo2/pdf_text_curation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
"outputs": [],
"source": [
"# Author: ALLIANZ NLP esg data pipeline\n",
"from src.components.preprocessing import Curator\n",
"import os\n",
"import pathlib\n",
"from dotenv import load_dotenv\n",
"\n",
"\n",
"import config\n",
"import os"
"from src.components.preprocessing import Curator\n",
"from src.data.s3_communication import S3Communication"
]
},
{
Expand All @@ -26,9 +31,54 @@
"metadata": {},
"outputs": [],
"source": [
"XLS = os.path.join(config.BASE_ANNOTATION_FOLDER, \"ESG\")\n",
"EXT_FOLDER = config.BASE_EXTRACTION_FOLDER\n",
"CUR_FOLDER = config.BASE_CURATION_FOLDER"
"# Load credentials\n",
"dotenv_dir = os.environ.get(\n",
" \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n",
")\n",
"dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n",
"if os.path.exists(dotenv_path):\n",
" load_dotenv(dotenv_path=dotenv_path, override=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# init s3 connector\n",
"s3c = S3Communication(\n",
" s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n",
" aws_access_key_id=os.getenv(\"S3_ACCESS_KEY\"),\n",
" aws_secret_access_key=os.getenv(\"S3_SECRET_KEY\"),\n",
" s3_bucket=os.getenv(\"S3_BUCKET\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# download the files created by the extraction phase\n",
"s3c.download_files_in_prefix_to_dir(\n",
" config.BASE_EXTRACTION_S3_PREFIX,\n",
" config.BASE_EXTRACTION_FOLDER,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# download the annotation files\n",
"s3c.download_files_in_prefix_to_dir(\n",
" config.BASE_ANNOTATION_S3_PREFIX,\n",
" config.BASE_ANNOTATION_FOLDER,\n",
")"
]
},
{
Expand All @@ -40,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -66,7 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -229,7 +279,33 @@
],
"source": [
"cur = Curator([(\"TextCurator\", TextCurator_kwargs)])\n",
"cur.run(EXT_FOLDER, XLS, CUR_FOLDER)"
"cur.run(config.BASE_EXTRACTION_FOLDER, config.BASE_ANNOTATION_FOLDER, config.BASE_CURATION_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# upload the curation file to s3\n",
"ret = s3c.upload_file_to_s3(\n",
" config.BASE_CURATION_FOLDER / \"esg_TEXT_dataset.csv\",\n",
" config.BASE_CURATION_S3_PREFIX,\n",
" \"esg_TEXT_dataset.csv\",\n",
")\n",
"ret['ResponseMetadata']['HTTPStatusCode']"
]
},
{
Expand Down Expand Up @@ -257,7 +333,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
"version": "3.8.11"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit dd380c7

Please sign in to comment.