From 56ebc15d07f42e4cf7ab7f6c96f2047361db04e4 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Fri, 27 Oct 2023 07:00:45 +0000 Subject: [PATCH 01/21] refactor: add lazy load method to save memory --- src/scripts/dep/llm_bot_dep/extraction_utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/scripts/dep/llm_bot_dep/extraction_utils.py b/src/scripts/dep/llm_bot_dep/extraction_utils.py index bd720fab..ebea7e9b 100644 --- a/src/scripts/dep/llm_bot_dep/extraction_utils.py +++ b/src/scripts/dep/llm_bot_dep/extraction_utils.py @@ -1,8 +1,7 @@ import logging import subprocess from pathlib import Path -from typing import Dict, List, Optional - +from typing import Dict, List, Optional, Iterator from langchain.document_loaders.pdf import BasePDFLoader from langchain.docstore.document import Document @@ -63,12 +62,17 @@ def nougat(self, file_path: Path) -> str: def load(self) -> List[Document]: """Loads and processes the specified PDF file, converting it to a list of Document objects. - This method reads the PDF file, processes it using the `nougat` command, - reads the resulting Markdown content, and constructs a Document object with the content. - Returns: List[Document]: A list containing a single Document object with the processed content. """ + return list(self.lazy_load()) + + def lazy_load(self) -> Iterator[Document]: + """Lazy load and process the specified PDF file, yielding Document objects. + + This method reads the PDF file, processes it using the `nougat` command, + reads the resulting Markdown content, and yields a Document object with the content. + """ try: file_path = self.file_path # Call the method to run the Nougat OCR command @@ -87,7 +91,7 @@ def load(self) -> List[Document]: .replace(r"\]", "$$") ) metadata = {"source": self.file_path} - return [Document(page_content=content, metadata=metadata)] + yield Document(page_content=content, metadata=metadata) except Exception as e: logging.error(f"An error occurred while processing the PDF: {str(e)}") From 24e50d70c25e132f7abe56b6fe2e4393e045b529 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Fri, 27 Oct 2023 11:09:33 +0000 Subject: [PATCH 02/21] feat: nougat loader and splitter class --- .../{extraction_utils.py => loader_utils.py} | 0 src/scripts/dep/llm_bot_dep/splitter_utils.py | 78 ++++++++++++++++++ src/scripts/glue-job-script.py | 2 +- .../whl/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 8272 -> 11106 bytes 4 files changed, 79 insertions(+), 1 deletion(-) rename src/scripts/dep/llm_bot_dep/{extraction_utils.py => loader_utils.py} (100%) create mode 100644 src/scripts/dep/llm_bot_dep/splitter_utils.py diff --git a/src/scripts/dep/llm_bot_dep/extraction_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py similarity index 100% rename from src/scripts/dep/llm_bot_dep/extraction_utils.py rename to src/scripts/dep/llm_bot_dep/loader_utils.py diff --git a/src/scripts/dep/llm_bot_dep/splitter_utils.py b/src/scripts/dep/llm_bot_dep/splitter_utils.py new file mode 100644 index 00000000..6efe4ffd --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/splitter_utils.py @@ -0,0 +1,78 @@ +from langchain.docstore.document import Document +from langchain.text_splitter import ( + RecursiveCharacterTextSplitter, + Language, + TextSplitter, +) +from typing import Any, Dict, List, Optional, Iterator + +# TODO: rewrite this function to use the new TextSplitter for mmd type, and this class is not imported into glue job for now +class MarkdownTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a MarkdownTextSplitter.""" + separators = self.get_separators_for_language(Language.MARKDOWN) + super().__init__(separators=separators, **kwargs) + +def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any: # avoid importing spacy + try: + import spacy + except ImportError: + raise ImportError( + "Spacy is not installed, please install it with `pip install spacy`." + ) + if pipeline == "sentencizer": + from spacy.lang.en import English + + sentencizer = English() + sentencizer.add_pipe("sentencizer") + else: + sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"]) + return sentencizer + +class NLTKTextSplitter(TextSplitter): + """Splitting text using NLTK package.""" + + def __init__( + self, separator: str = "\n\n", language: str = "english", **kwargs: Any + ) -> None: + """Initialize the NLTK splitter.""" + super().__init__(**kwargs) + try: + from nltk.tokenize import sent_tokenize + + self._tokenizer = sent_tokenize + except ImportError: + raise ImportError( + "NLTK is not installed, please install it with `pip install nltk`." + ) + self._separator = separator + self._language = language + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively split the large input into a bunch of smaller ones. + splits = self._tokenizer(text, language=self._language) + return self._merge_splits(splits, self._separator) + +class SpacyTextSplitter(TextSplitter): + """Splitting text using Spacy package. + + + Per default, Spacy's `en_core_web_sm` model is used. For a faster, but + potentially less accurate splitting, you can use `pipeline='sentencizer'`. + """ + + def __init__( + self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any + ) -> None: + """Initialize the spacy text splitter.""" + super().__init__(**kwargs) + self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline) + self._separator = separator + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + splits = (s.text for s in self._tokenizer(text).sents) + return self._merge_splits(splits, self._separator) \ No newline at end of file diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index ea25ac1b..c77c65d8 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -14,7 +14,7 @@ from opensearchpy import RequestsHttpConnection from awsglue.utils import getResolvedOptions -from llm_bot_dep import sm_utils, aos_utils, enhance_utils, extraction_utils +from llm_bot_dep import sm_utils, aos_utils, enhance_utils, loader_utils from requests_aws4auth import AWS4Auth logger = logging.getLogger() diff --git a/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl index d276ac3cb34ca987bd3bd284e9bd2418f648ff28..a2d8d9b5f70994b93b672bdda11003f809981ef5 100644 GIT binary patch delta 3533 zcmZvfc{CK<|Hp^2GdyG;Bm0nL#$-44E&I+e*&18+eIn+ONHkY6e6Ub`kn9j{+{1??!BM$IrrYzdH?lU&Sywt$(Hn-4Z#$Nv>O9!h4)ec0B6hq z051Rl@b&d`b_)n~M!N^f_y)M5-GiOOLcM)Mqyul^?EC_zNI%`6BB?v$uFBa#x z^7ae)nccMa8gs4g+;&I=v$&1il%mDfj_t>3jUW2c{yar+l04+WOC6!vm)5+)e)9`O z^MNPSACC_rBtjU1N+e^jzBwZ`lAKE;v1z8fl@=c&pX?M92`EoLsv09BsSv2V_qW2j z{j%!7J@QZXFFEGbUd+@Calgn)wAjYmcwDtaRQa}+UlD7IRs%UgTD|EPR~3ls6`8`I zcHt#P%5c=d_F-ud3%aD*_V&<@ZKD9ui{nNU{l%Des48bY|Z&E9{DO7XL(7lYpDHywq;wtJlL~)cYcD){Ze7V{*WA$pwbZt2Cj^{r7y3Ji* z0h=C*b|Ew^ONyOdl})vKgV}M*;yj2xt(0ghZ6d_r{)!8RqI(x85aYP7lnTMvlW^)A zJWDXd)u{Wm=_^aoRfeB8L&fY`{!g z`DWfUnYi2!9#2q&arCM`GpM3O9TiTGHpe0(`dUwFV~yGApJ0@^$474W9MnBdEXGT7 zhbSsIs>rmWR}_mmv7RqO%={4FGWzkdx&q~DtkMji}gKEe=A^O}(U&$2VJ96v6oC2bKk zTWOe)&qu)IntF!jw}F~;IJs0#!5kIs`+@u2X6(_aPPa@(u?olEm*P%#)i5!&sxDmp zm?6>9id0W`R+Y{t+n*}Lloc`+IGcc7ph}hE{R6Sl_O^>4eLPy_eM@mdZ{8-TP+NRz zBR(3`CLm^f`qn+K>8qLX1uX}P5SC*hPSa7Hzak~Fh9R$1Od>g1;w54_K@*;1lhC0OxWWGX6NZ9W&tz6Im&y- zO*tJyi4+Tkp?vSooib!s$>6p$b(Urv3vYEYT)OqJeTf<08tct06F)(db9f^rGUkop z!)QHW@zq0w^JXDN>&fyKz!58R8>l?y^0aIh!7%59)$CdSY4&u9!IsLuEQIk1J-no& zvsfn)>vJ;QA>b(^+Hk0+N3AN{O?7W@(N##5`Or*LP4HpbYMM6ffvc-%xTqKFWK5<) zEjueyW$5hL*B!t7$sBTzZ^RT%k{dMfmTJBrYL-VmC3I56rM|c8!-1Dz$Yq4u6UJPm zRfLriwo;jZUczV#X1c7xUVh{I&Oluagw=5K zZ}vfhR*A-SyGv}UOOvjf8?A=CEasjir4*JmM<5ti{*B0uC}G8&&+lMEa`$3O)46p- zPeH@C_Kdjrdrae(ehGzkYsp-j%};-NG0=iDH3o8h6*wrJkzIAvZvV6o}mCrVE z5sR>Mv}24~8Ctyn0C+I~0DS+i=!XRQdWVMopR(UOjHxQwyjaH(cFC0O>DyHMgQ?xhBagwW<@55SPFVoMth>R;eIzWZc7558LGwBz>}1q$a`-dxpV=fijxvum{544-aak1 zTnuYNyupfAhekj9XfaSzHm@)27{y{+{*gVpHdk)ICbc5slO~ffgi`1x)<;YJZ8&+y1t>WA=JId|u360{ADj+!KC(Vrb{Wh}8Ny3T}+bRFWaI8Fi zz-1@s1dO>cmPwIUH_n=Q+5Z|a!_6R=r|SCc`s1Us)zLsMV7tL+^Kq<=dqrpj+ zjad1edw~WA-9)R&uRyBI4Zj4uu*OJzGxd#{)ND8B3n;(|5rZfmeUxRljCg%~HCEPo zE;MOg;gIXa4yG>$LUPmWXYP7)D-b$G2DiTUt@{grR$IE0 z_+1iG=kJ}!2G^6;X0nc8SSpz401fO#e$P7)-T-CFz6iHyT;k>-I-24-O^VI-zq+Sc zI)|-zmQ20cpFwkfXqYS?5X#!xRqJ(&(Pm z0AWnlHlM6Vbp~tNQF7i#ndp|Hr9oL?m3yheBV`_R2;C!W6}XMO<*I#U6rvoqAe#;q;y0#7`LT%s&B_Zh*tKACL1k@2((I(DA)>KiKt78O)6?TqKadJcM{I z@`VI@M?yGv#~-iCll8}s^H{Q;zuQR{s8y&@0suqjWke7_2;>K2Ep@fcEOnxX`|&r- z0SSxI&J5mnU0@6X4%>Q(Iukj9z?6j}C7y=kY!fpP!t<8cvbo9@B2n`Woq_8KJ>Vt_ zh090jG4pphq#vEhQv-fRJriIzo}({e8VsfM`kayD4y1}tjD>Y!L0ktyolVve_77;F ztkz5sJlRDSI$Kd#g`8s)?QGkmaeQB~&#=Svu_+BBrBnOERJN2TXh0md9mCt=G5>5%uop29bm88qu}1uu{4o71%= z284?3YzuF>Ky@UWZF7ITxC-wLFeEt9^RA6Kc<_(gDVn-sf1!y~jt%!?2D&XD;oqqZ z&Sl8A*Y#a4MbM%bP51`B->;!b@HO*2 zx(Om{A19umZuTdp;;sS=1@n9SFjo3M3Xgl!@tv>hxE9OLgDrJdd}c+@ zGUuzb`J^eX{@ZYJ-m3nr>ezg?e;ZW@1z>hU5Ex8Z#7#&8i^%_5uluit5BQ%N|LJ_^ zDsakVLBc06P&XsPVnM?mAt0}ZaeuJR@%`9^xD(c^#jORP~Jnh=Ve&7BF DwUB4E delta 758 zcmaD9cEMr8O5x3GB~P$yo~p#e$ZQ!IIeC`yVKAMbG6PKiRtXXUiib}=bNKWVMg|5o zW(Ec&hRGMTnYQDVly0b) zZs1NUbYxx_IQgOfZHpDVssjFhom<-$$-Cxm!`brPyJfsO^q>3no-I@7pU|(a-?OTA znrxb8-}8I74&E}@SD%>IwDzo7c-;NZ7jh2UCGfZBO@E}X9xdL=Jt5%p%I;Sz!KbRH zKM)J~z#hh3q`be*XyJthhNp9*{KUmV&gSqItvi3D(Bvb_=f`)IBxml$$K=ln9?OCztGfB0HqGzGQpk}1_p*tK&%49 zFtDXDQxcq3{Io>C3KuRld=~?h1z|;)Dj>b3QC|wIaDkQ}SRvo^!=-#c2_7KUL{WGO kq|h9!2sKe6RBd6Dmj;{Wrp?F4#lXezor!_LSr()K02$yi!T Date: Sun, 29 Oct 2023 20:38:48 +0800 Subject: [PATCH 03/21] feat: add CSV loader to save the content in markdown format --- src/scripts/dep/llm_bot_dep/loader_utils.py | 190 +++++++++++++++++++- 1 file changed, 188 insertions(+), 2 deletions(-) diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index ebea7e9b..08c09354 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -1,9 +1,14 @@ import logging import subprocess from pathlib import Path -from typing import Dict, List, Optional, Iterator +from typing import Dict, List, Optional, Iterator, Sequence from langchain.document_loaders.pdf import BasePDFLoader +from langchain.document_loaders.csv_loader import CSVLoader from langchain.docstore.document import Document +import csv +from io import TextIOWrapper +from langchain.document_loaders.helpers import detect_file_encodings + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -22,6 +27,7 @@ class NougatPDFLoader(BasePDFLoader): ImportError: If the `nougat` library is not installed. RuntimeError: If the `nougat` command fails to execute successfully. """ + def __init__(self, file_path: str, *, headers: Optional[Dict] = None): """Initialize with a file path.""" try: @@ -96,9 +102,189 @@ def lazy_load(self) -> Iterator[Document]: except Exception as e: logging.error(f"An error occurred while processing the PDF: {str(e)}") + +class CustomCSVLoader(CSVLoader): + """Load a `CSV` file into a list of Documents. + + Each document represents one row of the CSV file. The rows are converted into markdown format based on row_count. + + Output Example: + when row_count = 1, + page_document_1 contains: + |index|name| + |-|-| + |1|Demo1| + page_document_2 contains: + |index|name| + |-|-| + |2|Demo2| + + when row_count = 3, + page_document_1 contains: + |index|name| + |-|-| + |1|Demo1| + |2|Demo2| + |3|Demo3| + page_document_2 contains: + |index|name| + |-|-| + |4|Demo4| + |5|Demo5| + |6|Demo6| + """ + + def __init__( + self, + file_path: str, + source_column: Optional[str] = None, + metadata_columns: Sequence[str] = (), + csv_args: Optional[Dict] = None, + encoding: Optional[str] = None, + autodetect_encoding: bool = False, + row_count: int = 1 + ): + """ + + Args: + file_path: The path to the CSV file. + source_column: The name of the column in the CSV file to use as the source. + Optional. Defaults to None. + metadata_columns: A sequence of column names to use as metadata. Optional. + csv_args: A dictionary of arguments to pass to the csv.DictReader. + Optional. Defaults to None. + encoding: The encoding of the CSV file. Optional. Defaults to None. + autodetect_encoding: Whether to try to autodetect the file encoding. + row_count: How many row in a page document. + """ + self.row_number = row_count + super().__init__(file_path, source_column, metadata_columns, + csv_args, encoding, autodetect_encoding) + + def __read_file(self, csvfile: TextIOWrapper) -> List[Document]: + docs = [] + + csv_reader = csv.DictReader(csvfile, **self.csv_args) + counter = 0 + for i, row in enumerate(csv_reader): + # print(f"i: {i}") + # print(f"row: {row}") + try: + source = ( + row[self.source_column] + if self.source_column is not None + else self.file_path + ) + except KeyError: + raise ValueError( + f"Source column '{self.source_column}' not found in CSV file." + ) + counter += 1 + + if counter % self.row_number == 1: + # First row with header and separator + header = "|" + md_separator = "|" + row_content = "|" + for k, v in row.items(): + header += k + "|" + md_separator += "-|" + row_content += v + "|" + row_content += "\n" + elif counter % self.row_number == 0: + if 1 == self.row_number: + header = "|" + md_separator = "|" + row_content = "|" + for k, v in row.items(): + header += k + "|" + md_separator += "-|" + row_content += v + "|" + else: + for k, v in row.items(): + row_content += v + "|" + content = header + "\n" + md_separator + "\n" + row_content + print(f"markdown content: {content}") + + metadata = {"source": source, "row": i} + for col in self.metadata_columns: + try: + metadata[col] = row[col] + except KeyError: + raise ValueError( + f"Metadata column '{col}' not found in CSV file.") + doc = Document(page_content=content, metadata=metadata) + docs.append(doc) + counter = 0 + else: + for k, v in row.items(): + row_content += v + "|" + row_content += "\n" + + return docs + + def load(self) -> List[Document]: + """Load data into document objects.""" + + docs = [] + try: + with open(self.file_path, newline="", encoding=self.encoding) as csvfile: + docs = self.__read_file(csvfile) + except UnicodeDecodeError as e: + if self.autodetect_encoding: + detected_encodings = detect_file_encodings(self.file_path) + for encoding in detected_encodings: + try: + with open( + self.file_path, newline="", encoding=encoding.encoding + ) as csvfile: + docs = self.__read_file(csvfile) + break + except UnicodeDecodeError: + continue + else: + raise RuntimeError(f"Error loading {self.file_path}") from e + except Exception as e: + raise RuntimeError(f"Error loading {self.file_path}") from e + + return docs + + # local debugging purpose # if __name__ == "__main__": # # local pdf file in current folder # loader = NougatPDFLoader('paperSnapshot.pdf') # data = loader.load() -# logging.info("text: %s", data) \ No newline at end of file +# logging.info("text: %s", data) + + +# TODO: Local debug CSV loader, remove it before release +# if __name__ == "__main__": +# import uuid +# import boto3 +# from datetime import datetime + +# s3 = boto3.client('s3') +# now = datetime.now() +# timestamp_str = now.strftime("%Y%m%d%H%M%S") +# print(timestamp_str) +# random_uuid = str(uuid.uuid4())[:8] +# print(random_uuid) + +# def process_csv(csv_content: str, kwargs): +# bucket_name = kwargs['bucket'] +# key = kwargs['key'] +# local_path = f'/temp-{timestamp_str}-{random_uuid}.csv' +# s3.download_file(bucket_name, key, local_path) + +# # loader = CustomCSVLoader(file_path=local_path, row_count=1) +# # loader = CustomCSVLoader(file_path=local_path, row_count=999) +# loader = CustomCSVLoader(file_path=local_path, row_count=2) +# # loader = CustomCSVLoader(file_path=local_path, row_count=3) +# data = loader.load() +# # print(data) + +# # TSV +# # process_csv("x", {'bucket': '', 'key': 'athena_results/OrderTable.tsv'}) +# # CSV +# process_csv("x", {'bucket': '', 'key': 'athena_results/sdps-api-test-s3-key-58h54muj.csv'}) From 45cb325619f01d81ae4766de3939823e2adddd8c Mon Sep 17 00:00:00 2001 From: yike5460 Date: Mon, 30 Oct 2023 10:02:32 +0000 Subject: [PATCH 04/21] feat: judge the token before query enhancement --- src/scripts/dep/llm_bot_dep/enhance_utils.py | 177 +++++++++++++----- src/scripts/dep/llm_bot_dep/loader_utils.py | 2 +- .../whl/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 11106 -> 13563 bytes 3 files changed, 131 insertions(+), 48 deletions(-) diff --git a/src/scripts/dep/llm_bot_dep/enhance_utils.py b/src/scripts/dep/llm_bot_dep/enhance_utils.py index 48a62e8b..88a53300 100644 --- a/src/scripts/dep/llm_bot_dep/enhance_utils.py +++ b/src/scripts/dep/llm_bot_dep/enhance_utils.py @@ -5,16 +5,24 @@ import logging import openai from typing import Dict, List +from langchain.docstore.document import Document +import nltk + # print the log to stdout logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# token number to slice a document +slice_size = 50 +# number of questions to generate +question_num = 5 + en_prompt_template = """ Here is snippet of {solution}'s manual document within backticks ``` {page} ``` -Please generate 10 questions and corresponding answers based on these document fragments, with the questions being as diverse as possible and containing details, following the rules below: +Please generate {question_num} questions and corresponding answers based on these document fragments, with the questions being as diverse as possible and containing details, following the rules below: 1. "{solution}" needs to be included in the Question continuously 2. The question part needs to start with "Question: " 3. The answer part needs to start with "Answer: " @@ -26,7 +34,7 @@ ``` {page} ``` -请基于这些文档片段自动生成10个问题以及对应答案, 问题需要尽可能多样化并包含细节, 且遵循如下规则: +请基于这些文档片段自动生成{question_num}个问题以及对应答案, 问题需要尽可能多样化并包含细节, 且遵循如下规则: 1. "{solution}"需要一直被包含在Question中 2. 问题部分需要以"Question: "开始 3. 答案部分需要以"Answer: "开始 @@ -34,7 +42,7 @@ """ class EnhanceWithBedrock: - def __init__(self, prompt: str, solution_title: str, page_content: str, zh: bool = True): + def __init__(self, prompt: str, solution_title: str, document: Document, zh: bool = True): BEDROCK_REGION = str(boto3.session.Session().region_name) # TODO, pass such credentials from CloudFormation creation and store in SSM openai.api_key = os.getenv("OPENAI_API_KEY") @@ -46,10 +54,10 @@ def __init__(self, prompt: str, solution_title: str, page_content: str, zh: bool ) self.prompt = prompt self.solution_title = solution_title - self.page_content = page_content + self.document = document self.zh = zh - def EnhanceWithClaude(self, prompt: str, solution_title: str, page_content: str, zh: bool = True) -> List[Dict[str, str]]: + def EnhanceWithClaude(self, prompt: str, solution_title: str, document: Document, zh: bool = True) -> List[Dict[str, str]]: """ Enhance the given prompt using the Claude model by Anthropic. This function constructs a new prompt using the given solution title and page content, sends a request to the Claude model, and retrieves the model's response. @@ -74,8 +82,10 @@ def EnhanceWithClaude(self, prompt: str, solution_title: str, page_content: str, Note: - Deprecated: Claude v2 does not output Chinese characters in experiment, so Claude v1 is used here. """ + # Initialize an empty list to store the Document objects + # documents = [] prompt_template = zh_prompt_template if zh else en_prompt_template - prompt = prompt_template.format(solution=solution_title, page=page_content) + prompt = prompt_template.format(solution=solution_title, page=document.page_content, question_num=question_num) prompt = "\n\nHuman:{}".format(prompt) + "\n\nAssistant:" # schema keep changing, refer to https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters.html#model-parameters-claude for latest schema body = json.dumps({ @@ -95,16 +105,31 @@ def EnhanceWithClaude(self, prompt: str, solution_title: str, page_content: str, body=body, modelId=modelId, accept=accept, contentType=contentType ) response_body = json.loads(response.get("body").read()) - raw_completion = response_body.get("completion") - converted_completion = [] - for qa in raw_completion.split("\n\n"): - if qa.startswith("Question:"): - converted_completion.append({"Question": qa.replace("Question:", "").strip()}) - elif qa.startswith("Answer:"): - converted_completion[-1]["Answer"] = qa.replace("Answer:", "").strip() - return converted_completion - - def EnhanceWithOpenAI(self, prompt: str, solution_title: str, page_content: str, zh: bool = True) -> List[Dict[str, str]]: + raw_completion = response_body.get("completion").split('\n') + + # Initialize an empty list to store the Q&A pairs + qa_list = [] + + # Initialize an empty dictionary to store the current Q&A pair + qa_dict = {} + for line in raw_completion: + # Check if the line contains a question + if line.startswith('Question:'): + # If there's already a Q&A pair in qa_dict, append it to qa_list + if qa_dict: + qa_list.append(qa_dict) + qa_dict = {} # Reset qa_dict for the next Q&A pair + qa_dict['Question'] = line.replace('Question:', '').strip() + # Check if the line contains an answer + elif line.startswith('Answer:'): + qa_dict['Answer'] = line.replace('Answer:', '').strip() + + # Append the last Q&A pair to qa_list + if qa_dict: + qa_list.append(qa_dict) + return qa_list + + def EnhanceWithOpenAI(self, prompt: str, solution_title: str, document: Document, zh: bool = True) -> List[Dict[str, str]]: """ Enhances a given prompt with additional information and performs a chat completion using OpenAI's GPT-3.5 Turbo model. @@ -122,37 +147,95 @@ def EnhanceWithOpenAI(self, prompt: str, solution_title: str, page_content: str, [{'Question': 'What is Solution Title?', 'Answer': 'It is ...'}] """ prompt_template = zh_prompt_template if zh else en_prompt_template - prompt = prompt_template.format(solution=solution_title, page=page_content) + prompt = prompt_template.format(solution=solution_title, page=document.page_content, question_num=question_num) messages = [{"role": "user", "content": f"{prompt}"}] - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=messages, - temperature=0, - max_tokens=2048 - ) - raw_completion = response.choices[0]["message"]["content"] - converted_completion = [] - for qa in raw_completion.split("\n\n"): - if qa.startswith("Question:"): - converted_completion.append({"Question": qa.replace("Question:", "").strip()}) - elif qa.startswith("Answer:"): - converted_completion[-1]["Answer"] = qa.replace("Answer:", "").strip() - return converted_completion + # error and retry handling for openai api due to request cap limit + try: + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=messages, + temperature=0, + max_tokens=2048 + ) + except Exception as e: + logger.error("OpenAI API request failed: {}".format(e)) + return [] + raw_completion = response.choices[0]["message"]["content"].split('\n') + logger.info("raw_completion: {}".format(raw_completion)) + # Initialize an empty list to store the Q&A pairs + qa_list = [] + + # Initialize an empty dictionary to store the current Q&A pair + qa_dict = {} + for line in raw_completion: + # Check if the line contains a question + if line.startswith('Question:'): + # If there's already a Q&A pair in qa_dict, append it to qa_list + if qa_dict: + qa_list.append(qa_dict) + qa_dict = {} # Reset qa_dict for the next Q&A pair + qa_dict['Question'] = line.replace('Question:', '').strip() + # Check if the line contains an answer + elif line.startswith('Answer:'): + qa_dict['Answer'] = line.replace('Answer:', '').strip() + + # Append the last Q&A pair to qa_list + if qa_dict: + qa_list.append(qa_dict) + return qa_list + + def SplitDocumentByTokenNum(self, document: Document, token_num: str) -> List[Document]: + """ + Splits a given document into multiple documents, each containing a slice of the original document. + + Parameters: + - document (Document): The document to be split. + - token_num (int): The number of tokens to include in each document. + + Returns: + - List[Document]: A list of documents, each containing a slice of the original document. + """ + # Get the token number of input paragraph + tokens = nltk.word_tokenize(document.page_content) + # Calculate the total number of tokens and chunk number + total_tokens = len(tokens) + chunk_num = total_tokens // slice_size + 1 + + # Initial document list to sttore ducoment slices seperated by 50 tokens + documents_list = [] + # Iterate through the list of tokens, extracting slices of 50 tokens at a time + for i in range(0, len(tokens), slice_size): + token_slice = tokens[i:i+slice_size] + # Join the slice of tokens back into a string + document_slice = ' '.join(token_slice) + # Create new Document object to store the slice + document = Document(page_content=document_slice) + # Append the Document object to the list of documents + documents_list.append(document) + return documents_list # local debugging purpose -# if __name__ == "__main__": -# # test the function -# prompt = "Do we have any solution offer to Stable Diffusion?" -# solution_title = "Stable Diffusion AWS Extensions" -# page_content = "Stable Diffusion AWS Extensions is a CSDC solution that..." -# ewb = EnhanceWithBedrock(prompt, solution_title, page_content) -# enhanced_prompt = ewb.EnhanceWithClaude(prompt, solution_title, page_content) -# logger.info("Enhanced prompt: {}".format(enhanced_prompt)) - -# # test the function -# prompt = "Do we have any solution offer to Stable Diffusion?" -# solution_title = "Stable Diffusion AWS Extensions" -# page_content = "Stable Diffusion AWS Extensions is a CSDC solution that..." -# ewb = EnhanceWithBedrock(prompt, solution_title, page_content) -# enhanced_prompt = ewb.EnhanceWithOpenAI(prompt, solution_title, page_content) -# logger.info("Enhanced prompt: {}".format(enhanced_prompt)) +if __name__ == "__main__": + # test the function + prompt = "Do we have any solution offer to Stable Diffusion?" + solution_title = "Stable Diffusion AWS Extensions" + page_content = """ + Stable Diffusion AWS Extensions is a CSDC solution that... + """ + # construct a Document object + document = Document(page_content=page_content) + ewb = EnhanceWithBedrock(prompt, solution_title, document) + document_list = ewb.SplitDocumentByTokenNum(document, slice_size) + # test the function + for document in document_list: + prompt = "Do we have any solution offer to Stable Diffusion?" + solution_title = "Stable Diffusion AWS Extensions" + enhanced_prompt = ewb.EnhanceWithClaude(prompt, solution_title, document) + logger.info("Enhanced prompt: {}".format(enhanced_prompt)) + + # test the function + for document in document_list: + prompt = "Do we have any solution offer to Stable Diffusion?" + solution_title = "Stable Diffusion AWS Extensions" + enhanced_prompt = ewb.EnhanceWithOpenAI(prompt, solution_title, document) + logger.info("Enhanced prompt: {}".format(enhanced_prompt)) diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index 08c09354..0b887cfd 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -18,7 +18,7 @@ class NougatPDFLoader(BasePDFLoader): This class leverages the `nougat` library to perform the conversion from PDF to HTML. It inherits from `BasePDFLoader` and extends its functionality to utilize the `nougat` library. - + TODO, the load_and_split method need to be implemented and default is RecursiveCharacterTextSplitter Attributes: file_path (str): The path to the PDF file to be loaded. headers (Optional[Dict]): Optional headers to be used when loading the PDF. diff --git a/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl index a2d8d9b5f70994b93b672bdda11003f809981ef5..778c0c5a0bae9226a323a432b7b9c258e79ca287 100644 GIT binary patch delta 7497 zcmZXZWl$Vy)20V^cY^ESt|2hEySuw2xHGuBy99T43-0a^bb`ACg62E#+1=WDce|?V zuIj$}`PI*lyZS)u)wZD=2EA6vNP94`RfrS~02p9QX@#K%>;3&oGJ^N_{Udro@9c8G z`x17MN}j;N0fv>EqmfpF$UbW;uY1eutLMf>D?}%KFe4e(CS)`6PVxKvourBk7)Tqs z_`xMS_RRLOGktM!u`Nic0uqx5)}QNnI-2qQy7Stzkw-P5#2ZOHh`@|0OA_`u^jH?V zoal5^*NPNCvnMyqt-1O%a!X3Dsioj~}ckV$Lj$s0=Y zCg2kD*|nG#u2oLlD6Z@xAhHQ+Xk`Ia=pG_u4>ZhWJq&8clR*a(ynL}tuWJIba7#yz?_ezt&J?iLn1P6w4 z`FWk_&A+C(gg+`_RLi3a*bP$rbn-mydTFrl{T`cftdY0l!(qQv{hQOn89iUW&+Sd=@utP!mlgfF z0O6OpF&E_s@;>&ihtW{i(vJ6Y0;j&;b3;$UNmNo_!5V+y9^-d3>_z@AoGgN4q7SQ| z6p!OR{7oPBr%uEwulOq;v#4O_NDcbF-8@<{ct4D3;}$F)!xO)^#w;2d$q3NPF9b@F z`C^;xQ67~ywhOFL+{{^jIq1UN$vWT64eRAin#~KTkliX-ugQVa9bYzaZaH$pc1XDo z5f~qGMr>?IKdPY}f%HeDKgyS zxigTL&!n<1Cr}Qt>cGkOg*hmlj(46eB$T7;+XYxdW9d2%9Lxyv3RJbiI6H zQD23QpbP4r@Bme3nwMEFq&wyr=?e1nU(D7scPFsCuQKAPzCgDw8=r$6RYJd%GqGIw zm<{8(L1rrsWuh||B~#145DthbiCicjdeAnbGUm{`V zwV1uiuNu}#DI(;{%V*Cu#4~iVbMWN^ndsjY3F@K|{`9phl&f)dD`(cn3%-ZR6cn*J z(cwApqAW>N_**Y3U#1L3#%zf9GBKQ-T#Sq3%)FJso0B6*ntWCHLRAVk;jKzs5^q8H z%1pCO%1Le+!mR|2(ae)NXGG>L2^W~=!WP1fI^pT7mTFM6PMmZ!ax>b(z=-HH!EXGk zxF$7Z;q>w!jA`t3xWw4HwESiR-w}EBVdK z^a*s6IT3+Jb9iI!=lmsX{6(s$@iWadb8ZHr8OdrG%Cw^_pEDY`6DQ)vvfl}>xTL*M z7bbEy9e5?vq2NH0BjG{8!1cNKSXwB`#&9zb!A8pPb~EZ-8fBqOBZ)#w9|t}zonB2) zq9{fJJ_=Z%Ec$k1;tw%3kWt6w--)ne?eSQBSWSyEXt;b?wm`tnM5 zq{*dA@+$5AUgZ+XRlJsCW~vwJlUmb+POwo-L#Ms?cg^E{kL{+aJ;@=|e1%u-n zGEw5RU_Ws|0uEnLmN^e3XPkpqxa8PiVb|r4bEnK_kcXK^ZelTPJS}=WHA1rckf;_> zurFO^s5(u&hKhj7&T!Bo^I-mza96<@G^?XvuY1n}$8RCJ;$Qh$zLTbSHuj&R#x1#J z#Z@jhJ zwhbhE0c&*mxGfw3kmZPS%rbnn+^!EVvK=lS-Zw>KQ-_#Zb0~zkXOcPXB#g(SAJ;uA zul#04rsQqXaY=NB%$XE>MWMVlnOCL376DF1oTy85LY5}O?CUN^7=a35AJj0|em58~ zR8Q>7YLAjtHo&EfVVwC1oW>(7loXWaz9-4??ymG5x2o<6+^oPX;QZmrEpdGjcw+lFOu)f;+3q zvD)qzrNtyC<-3{b5*X&+{5V)y{*T29uxaQE7FCD3MSiy4}1CDe1P32OB^4a$^ z%89ziCU%oXsRUsXA1SnwH7f2&MuU2zc$3OGEXr985^}wcm-VjlTW`Gwg@v`$|6m8l z!wguL(#trOTdW2jl2o%5t{eE8Vionwi|sI>f=EwSFG6z6KX^8tY}|;f#C8 zzD0&kB2_zS_O-i&vN+0z3uYPXt{$B&Pz>jp8A=cb4;P>JMO`#3ATt6FY zw57F5zY2C+(gR!~^q6o^Dg2%Wt~5xoQ3Z*6N*O@4eu&v13qYvqd93QTNUJ}@Yg*KX zbggcm`WUj=5>zoxm(kOx;7rVhn){3mqjRf2Lj@{4V`m~o(mP?Oz7ns*OXzO;(Dl8r zu2#4X;(CWpU@$U*3xom_DF$Oc^&Fp-a}%Xn>NwR=_f33jgDhvzqPx@>#&Ax*^?}6M z2nwZHvx0(gJaEY2QSWTj6^Z0wdaLmebuFeuCTU>{UWUJm33G|Ze&xNQ9kZL1IDKk5 zioDx9PTX-eh};WpE`hD3w@N$ED|}2=O^qGle8z*{aX)+q2gyk+_N!s?KN*BSS)6FM z(kPsTA%fK9ZrsgFN#L6BCH<}D+7!=Q;%XCUnTaMiT?FOceAo$o>bp3^TFx7fRT(ot z6z#%x3&xk^Ik5KHMd_?f@p|=AxbvDRrg>8{W!9hvqt+zNJs~RxESdHhAz{c$Tc}e? zngzFEakvKH=7v+Q%gfC}Z(0!uZe<$`ehEacM76O*IyXNkvw|B@c?*ZeU zoDP*%^vebi#>UPdizpH2wo^#xrFYN!1F>wC3ZC6Eky!a{9} zLdS)6=$+Yc0)-D>n+lwtlQm>)OzsVfa5odT5R9VQSIXhpDC13qkMflh$-EBtm8Y83 z%l7xNN6YC99V+XX^CKZ@lb$oNYubQex>3}32^5HsON~xeqvthxFw%a0O=%TY;!3bF z#jsw(eX5qQ^&T%ATOAf!DnJ7#d#wbo1CjbSr9Z&i_3p|MlnTZeAUoB=ZPFk<=oxwy zfY=WIK&6TUH6v=27I_HhF1lzkm_N4@T1wYDySRP(wImeX2Z%;VgAtUM=$xfWN5-No zEd_0jK-VHucev1K^+X3rj|fLLK+Dz*e`FH~!G4u~;+GQpri^l#W=WZ$$&jigU}OPC zWj?CHc7W=_#Rc1RP~BN*7`cL0qo8B@@qZak^CWph?|Z_BP6xs*z%s)1fc0WhWr|eK zHSP2R_S9O5glIhdHF0%0%pY5E?8Iv|r{qT~|nl0`FM6 z{z^akF|d>yA{wurU+u272rrOlmZ)$K-69s{1{PGDK3EFWJo);WpY6#R{4a7t3tD&kVF z1MCmXW&e?#AS*)OvbtL?@oXfbN^D}lv7i~eu{GkAts@|a=XM7y^NfASrnE@gjx02N za|)l+qxO8CzmAlMs>&PKzJ-GsTXJH_rU~puDFsA()xaw&%DS&21t^$Bdo1bkxRVi% zcJ8uD(v*s$>nAemwl)1Go=OLcnU<0vgICC;9hB7vy~6I$wD%MSL}Z}9Eb8@+e>dgl zG!-W6Z>3EXwxx-D-d)om=rUq*b<5m+_Jx0%vkm7tg9T; zCeuUkL7xu#U3)SW<30DX-GWad8+BbdUbs>DB8b{S>#0|cj3$w8N*rCZb}I;d9gGv6 zB=cb3IZ#WnFfVg?Dg8Rim;VcnWxKV-X%PD5*y@FmQ_*}ReBP11#cMs`^dP;!o+L@_0KCJ2Wac+rxL*8H`% ze`bRbl;op(iv$gfLX!*@ikfTi1OtBx{#Xn%nSUN8f=^V~kj_ZGT^K@uP^a9ENVNrR z3Lqzq3E?{J`?)#yKW9MDOtwHf=7CB`CzKVcbhISKf+yyxo5780!Ryl6-f`71-xPZsH zN&aFazmO2IdjrqdGw0*21(z227BF&&u#~Odzw@8tu!ok~L z{h3xuj=O;R5x=3K_j$fn0Ze`SjQ{$uW%UgiJhL==86?w`w}oa=0H)nImUw)=v}k7T z{tMP6=Diqajk^f2_cq4dAgE2s;J$0(xr2#&`{t`$zVRTjzKZ(lx$VQNg1cF`&>^>| zMSu({LvjpMoL}9WZ_Vc7gt_)Xx-iM>#6i_qoJ}XDJRc_OGfOZ`JZ$@fAe1vI%1cpAGz~LaCf<7Y0CXT zf8apR0~OkJC5oS41mW1cQhee;8BeogahMObD!Q_=n&}v$s>G;`wx>ZX7W}>7OhmV= zCmx7hSSg}1_1=2N%?p;Y%q}(eR}ps8^OJJ8SfN+vj1U2%9#>$jqMGLTc>rE$IC{e@EKkOn>5yqJs;dmpXUzS;2HO#)Y;t|^v^bx|!_E0%@0 zfL=y1q1|G7qrT;&^*B|C1i3&Z!=~b3SHXH8Kgf%7q0vAA0$wRXvh$gxSBYw&ZlO8L zX*9>8t&1C6M#2k^*l|exapgf$J@v&us+pt>gWjbL5=aoRmzHh+jzZ)d__2oV4@vAd z?4{bjhX34k!~x$ku~Xo!@a9HXhZ>T_P4tiD?tD2k*qEp_b%6S#l23op%3K&PZe5fv zMvkrKUoSCRSxIVA8`m$`F7A|KW6Hr7cmm{O86<>#xhb8iQ{e)T%A5qJo}3Y>dYSUP?ryEnN>xPKf87IGPp3X^hiLH@IrHW%AAj&73`oAaP#Lh-)p zxjc%7E3nhDh}P26KqWNUz3?rhC6t(Q?vsXEaTn3di@%mq&~Vny#;J#px=5<$%8f0xDmQ`t{8^z{A1hV=CtbOOWvgG-aGY2RU@?<*$-#Rgn)W@(%>0+Ee<6 z1z_}Tn)F!g1xV3#^~-3|ko)1NOto=d;OVMk5&7ZyX9Lvf_Ci@Q3Ifv55X{zIY#7x#GBl+Ye)M2Wi{A*4SSF6T3%J$_v;aSh-_J}wie`8 z=nmfc(8iPV>))?F)(JD!xnwalBQ`|6TG@#za(wS8#PrmBRh0a%O%GH_X^t-F8+~FUcyRdT-wnxz>ctCMi1^9>xRy zaoR?8iyaa2{gG+a%< zV1^1)y2%U)I| z_a#mfC_Y)uYev7fDcgMW`bwqe=Ee{A&C^z5?lKo>bblDo6pwqawrh$+HXxz$Wykgx zu`=i4?kfig@-1#`ADBCXG3CB#%J+6fwR%Pxuj z>2~5nr(VHBB~eaTXQ(cQL}Y(Eckev5mMzwi!W)+UEf-Ehjo7aI4Ld-hl~D1I)YD{} zEQCzTe0s6x%2=fJaS~bWa9;j&AoxJ@(K#S4g#=ysXogryAB>oTNFW_UF4V@opG?RJ zvE83N=N&sciFo$fVjuLZVMf9@Sr-brNBYB<=)t-ppa4RyZXj4`5uD-~G^;n{#M9E5 zPuV6>5uz56iZ_fyd{(9t?IlmG{cgAZh?-p1Vwmf5l*sH28(lA6<#tfP<)SrEMeM+Y6|@;+DABpa06$^Em6FEB*qAic5U1kmV@-xWfJ!L>w^S zBU$%rOxfCvlM*CZujx7)++~=n$`BjomCqX1z`sKhe0wBi41DwiTc3T*7FJ~Rrwq_% z{J&>IE1(|q17^xD8YQ!eJQOr83@!iwKm`2%F8H7F|BjZRdHjDou{5**bxJ=GQHnDI zBQz8+C67TCnvyx?gaJtLpEXAFz={q1XX5Ukr1?Li|7c4PHOs$Ct;C4?FIZH{ab@sN z<3CCHU*PHg0t*?j{sqx9hTPHr0sb-O0EYhpLs|cMk?&vVzkkpFrCOji{sWPvNHO8V J>v8<2@jpP$E?@uv delta 4946 zcmY+IWl+=&*Tw;FmRMpzI;C8?ySp0%DFs2gB$xb?(z&#RG?Gh)uz*NO3lfroG`k=r zlJEWc&NJ^k=gT$oyJpTgAI_OMb8ehAO%RojNKI6qo3?LiM<3x~U^LKUV9;T}Ndy@H zl$o0c?J&vx)LB`;-sl3#X)W1~Uo)zO^RMxwe44j?ip%-9ST7iG=8m zr$vqr{r$y$0Q#Niaf)wGY*K)+Ak%O@n#wnS$fbLAS}IRw>9^{+w-RdYWCh9f!Kagt z)BWXAUo-Tz&h@ZM#La&5@JB2Dr*PaHy;V)<&k$xYEMcMC+&6A!(1s306IDPR(uHJg z8rEkZW-c3GG1F+tb>p-Q%DbU&x42lOIWylMUm2y`adD;xYjCZu9SQ!4r>QZ|(z_XN za?Zp*tHU6`D`PX0lQ#{EvZ2b%em)Y*6??M!#M;_7#uXo}0=ud+P`I8;sZ#5!F(?Wz zKCD0Llyx!+de(JWU6n84T>&&vPxCGtrhpkrBhq-!%wj)kC&HUauINKFs*WMJW+Tdd zA>`r*3BDBg)6kqp25M?yur@1bnN={vx3&4ysJ>w$gGhnSpRk2Ln4kmVzHB>ck1VTC zsi6buG3@PoPHi3Hi@fJflztm;U1gTbAD33N_C(e;xi6=sUk!;;0f?yfIax+M9&^nx z)HD84htXzgz`4n_Suvww4$e`Z<-RE-8Y49lzmV{h7ySAsm!lPftx#x`%o5(@V&qWb zYA=%e78b6@MTPqEXU*0V0-Ro>FI!hpTpyQzZ_)ybA1_m`h9Xx{QS`4rZ^m;iRpYef6JUt&2Zu=+ElZ?doH)!~8h#MJ5Nx)3#?Sia#zO|X%A>y6b%;1i4L|Jtr z{BPpt=^pk2nX?z0Ba@N78Zjf6RS>)RV}o`3N+$G_5cKDzbF<>c7J{geJFKOpw%Ynd zrLK?sJ0L6rq5E~f{A(tI7DzyxVw23>P|ydt5T!jNgZp4e)tEP*T(s-ppgW=0>spW( z^2Yt1b$sBuS~IZ1+Oz$ME$_jx^8|qzrPzS9(U@>zYWbhsfE5ZNX6BD-N2wBsb}bnN zW5xVS*w6010oiZ<>~`0fHr%YlQ;vAzlaZ?&OGxfv#JL1k?;j~n)ZkUk zcQxVLn+PU>h5w<_tO9pR(R?U)hCt3655~msPUEm&jAqm`YDrTHc;u#`Nk})URy%h4o znk(_u^|%wkL?`yAKJ1@_w#L|x?WYV7IonU=)8<{pXh#L31O`SWS!BB2K3N^H9g(6( zy;splvz+KC(2bHfRSs<=Dkrt|6aaE1dvGu1S%2}RB@nS@d5|h+8wvJ^3t7pVrD)Cy zdlzu{)E)W6z*7rp#-+E*jJPCu{SP&G%>-(NxN)g5;w2`|yXtkY(xPb(ym4JwBh~FY znu0ue+`2PkE&O<`ixZ?Nbw)o!Px47fZRMdtG4{;^uI{*l-cm5k9O#K6F)+!X+TRz^ zvQnn&On7GE(lO>`*K^Q&&`YKuHM6oJXqXw;rCRvy%t+ra{8+JMidbARYSlJG(}Ws4 z?IG(Y;j8Y8$6Pdb#a1qiy6elf5%Z!Ktsu2$8SyH0$HGygZA<>=xmHX3=bZ5_QUBCL zYg$K113AdI3BzcybG#(`O(4$ePHcK`5vsxz$+orAfU_H5jm8iz1{%6K&u9C? zY5Dl7DXQxVakO`}p%_l{4+V7x1S^Z163edEE}=cXmdLYK9+maPjWNRKY00g`uJx6Z ziDIiBtqVLLu{h0vhHWZQ22KohH2;okELyCv@gZa4PQa%l?d4Dl8sI?t#m?l`_kwgA zGbCP%#{6>m0bI;2nw(nc)jIDt1kU7{{0P83)U0ca-#ZcaRTklz?z5@^P135Dd@!mb z|G6)@!kSnxchp*@pRjX=jRe~(FcNT7aU=7GAEyOpy_)3kc!ht$Q>6l-inhm)OO=^4 zG`c9px|w!f!zbho0rE-umcx6czD;P8J;Bjb0#Ogyt0eiVdx$LZ_(VF_VaVO`A0v1) z8ZO*;%-pqWxuJ?f_~Mn0D%CZ1N=&uS$6}o6HGOt}j2L;#NzDuznr(I-SYi3nna&Up z_f3z|2n=eQ#Xe%k@u%U1C9_SqT=a*zBRddt;;$IkvZzLlc9PC;!4qCf%;Vq&RmXZTsl1)2xg3u?#Jc84wTpgQ1mFML} zkupZ}SpF&d5w%X7R%QL%!2RE`*G78@0PKhFwdH6Wmy@=1oQagibzUE#CLMet+WNt2F%Uswt|G4Mt9_1(bkAG zUHEmr(Rl#^#wJA!ys939T6cb<-nQ!AHN)dpik-)^YOLU7=H1~F;XC)F1C=BLW%l>V z0=JCK=a3z2BEmp3(>Vr3zcFkKjC*4E9t9IolBt`=vMA)f{-pvtEhk)#gceA$_}P1m zt}pVJ%srkpzzaLxbP%1ycXH9Sq1w!z>_;5$@VHQds{4ZBrKi^rQ^)akUO{k*cp z^9+4Qew?r0;z$evM?Rn>aDlGi^rRNfpqfKPw^CB1Vkg?4SFZbks+o?e5sj(dwown` z(VR6mNv(JTu`6#e37$robTL&sQrLFm{}cL&Uz)OmJy`dX3kA)?{xuhsn&V7pu_oe^ z)0>bxUcBeOXLXl!!xcrmv`D2O#3DhVkmpz|6NPg${En@gAs*kESAd8#<{S3Yh04R4 zD4v(hiWfwQUh`Vgs{=@jiPpg<70Tq_D%DBa-CyXf_38 zgoEPRGJbA`7~yuG!{Gg8zcm>hE=w6gx^hoP38YSP6t?U;CV&Z zQMWrQ#m{4NQNK$1BG4Gy#7L7sEdJTDm)XT+b|wsdTi9*pUX`1bWASZ6HvXVmp$~`H z?88?EbZZB=r`J;fFG(bYk~v>WDbDkJSdTm;iDk+vNM%hg?MAslyc76e!{nY;tom#q`)CjWd?O5SHU5OIM_r2&4 zx_&08_HFO$!tP&s+K&{?F})$zER?#l$|QlDIWz2yQaXW@kZ8_Oc|p*9ONm3u$n1~3 zDZ^JKc>pQQ=erH*5VFpO6;skO2Me~j|H*q{T?XwipvP+e2ZnmT58HrEQqIywypqg- zVVOaF1T0;n)^5SM&3s7*3d)-+79|0LV6uSJPn*Q)ec?{Te9?Nc;R}@y43;U! z0xmbp156HlPdk6Bs)D3hhp}U(e%P=`6aUszkY!Fy+fGx$y|A%)>i5)gxYJiJO{ku@`&2IB1ndF4<>^&AMZ3=*xv8v-cAj*>?m5W=8%- z5Y^MS=@SdmX-wJ0M`&L(Bl5|E6OxW(o1QwIGwcrRCEPa{n^ps!4bJS($m zIr1RxX^I(quIOKZhosYzDVHL;bQ$ROlaD%A2i4{Tlbdl!4>#wlz|K8h`M=#l6K!WR2M9R-NydU6LI)SjS2l}a!U=A4}!$?05Fh4#0=3LODnTeE8Vc>S>+K2X#l>wz`DGIx(j!Jk@T zp)n6l_zQg6ojG%R#syPxU7H8HYHW+nxKLnn*f2Bwv+UGbmguMU@h25BxOP8u=qE4Z zT5;kX?C$&N4xk?R;Vh9!4mMMEmg$F`jU!S$-*`|Nj2VY2Z`pRfExxmKo`_9JQnFxD zyf@O&-|bLg8O0ZMO=87!`)M|w~xjI@+wn_un`mX|GD;jbn#^D7fCyqO_NTt%PJ?mS&I zclF+GeMEdAdh}@i_s2Wz-lbdvG9r0J`H7cw#?YK}zz@$|0lU|_m09S|B^)nW3Ri!P zV>_eiy)%V(s+Sv6Tb;WuzucO3#IM~Xq~XyNvfYSYue2(A9d0g=P4wSCCisJZIo1Fn z974hb4;CbN^50hm1_mw0|3^!B(PQ`#1w9;Jhybp{LPqc(t0laY8vaXA3Co26PS4Kx z*S8U3z)E0-hYPX)9q3viTC5}%c)u_M+=>Smek?@r7vKMjjadI;31O1INJp3!3;llx s$xFzsuZ4+4_K57?Qn(oZwGodnFqr?-z`xppsl@hg9$`>F^ndC900($hBLDyZ From d119f5d9bc224343970f39a77767bf11f81aa17e Mon Sep 17 00:00:00 2001 From: yike5460 Date: Mon, 30 Oct 2023 15:11:05 +0000 Subject: [PATCH 05/21] chore: search funcion in aos & todo items --- src/scripts/dep/llm_bot_dep/aos_utils.py | 20 ++++++++++++++++++++ src/scripts/glue-job-script.py | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/scripts/dep/llm_bot_dep/aos_utils.py b/src/scripts/dep/llm_bot_dep/aos_utils.py index 41f7e0af..e1394333 100644 --- a/src/scripts/dep/llm_bot_dep/aos_utils.py +++ b/src/scripts/dep/llm_bot_dep/aos_utils.py @@ -84,4 +84,24 @@ def match_all(self, index: str): } } response = self.client.search(index=index, body=body) + return response + def search_with_metadata(self, index: str, query: str, filter: str): + """ + Execute a search query using the query DSL, using bool query to filter on metadata. + """ + body = { + "query": { + "bool": { + "must": [ + {"match": {"content": query}}, + {"match": {"metadata": "true"}} + ], + # looking for documents where the metadata field exactly matches the value of filter + "filter": [ + {"term": {"metadata": filter}} + ] + } + } + } + response = self.client.search(index=index, body=body) return response \ No newline at end of file diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index c77c65d8..c0a3a294 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -293,6 +293,7 @@ def process_pdf(pdf: bytes, **kwargs): local_path = str(os.path.basename(key)) # download to local for futher processing s3.download_file(Bucket=bucket, Key=key, Filename=local_path) + # TODO, will be deprecated and replaced by nougat class in loader_utils loader = PDFMinerPDFasHTMLLoader(local_path) # entire PDF is loaded as a single Document file_content = loader.load()[0].page_content @@ -357,7 +358,7 @@ def split_chunk(content: List[Document], embeddingModelEndpoint: str, aosEndpoin def chunk_generator(content: List[Document], chunk_size: int = 1000): # iterate documents list and split per document with chunk size for i in range(0, len(content)): - # split the document into chunks + # TODO, split the document into chunks, will be deprecated and replaced by the ASK model directly chunks = [content[i].page_content[j:j+chunk_size] for j in range(0, len(content[i].page_content), chunk_size)] # create a new document for each chunk for chunk in chunks: From 1e3785de9dd49fb7861fa861c4fbf0c706a061d6 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Tue, 31 Oct 2023 13:59:17 +0000 Subject: [PATCH 06/21] chore: metadata template with parse logic --- src/scripts/dep/llm_bot_dep/loader_utils.py | 78 +++++++++++++++++- .../whl/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 13563 -> 14459 bytes 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index 0b887cfd..cf29511d 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -1,3 +1,4 @@ +import re import logging import subprocess from pathlib import Path @@ -9,10 +10,69 @@ from io import TextIOWrapper from langchain.document_loaders.helpers import detect_file_encodings - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +metadata_template = { + "content_type": "", + "heading_hierachy": {}, + "figure_list": [], + "chunk_id": "", + "file_path": "", + "keywords": [], + "summary": "", +} + +class NestedDict(dict): + def __missing__(self, key): + self[key] = NestedDict() + return self[key] + +def extract_headings(md_content): + """Extract headings hierarchically from Markdown content. + Consider alternate syntax that "any number of == characters for heading level 1 or -- characters for heading level 2." + See https://www.markdownguide.org/basic-syntax/ + Args: + md_content (str): Markdown content. + Returns: + NestedDict: A nested dictionary containing the headings. Sample output: + { + 'Title 1': { + 'Subtitle 1.1': {}, + 'Subtitle 1.2': {} + }, + 'Title 2': { + 'Subtitle 2.1': {} + } + } + """ + headings = NestedDict() + current_heads = [headings] + lines = md_content.strip().split('\n') + + for i, line in enumerate(lines): + match = re.match(r'(#+) (.+)', line) + if not match and i > 0: # If the line is not a heading, check if the previous line is a heading using alternate syntax + if re.match(r'=+', lines[i - 1]): + level = 1 + title = lines[i - 2] + elif re.match(r'-+', lines[i - 1]): + level = 2 + title = lines[i - 2] + else: + continue + elif match: + level = len(match.group(1)) + title = match.group(2) + else: + continue + + current_heads = current_heads[:level] + current_heads[-1][title] + current_heads.append(current_heads[-1][title]) + + return headings + class NougatPDFLoader(BasePDFLoader): """A PDF loader class for converting PDF files to MMD. @@ -96,7 +156,19 @@ def lazy_load(self) -> Iterator[Document]: .replace(r"\[", "$$") .replace(r"\]", "$$") ) - metadata = {"source": self.file_path} + logging.info("content: %s", content) + # extract headings hierarchically + headings = extract_headings(content) + + # assemble metadata from template + metadata = metadata_template + metadata["content_type"] = "paragraph" + metadata["heading_hierachy"] = headings + metadata["chunk_id"] = "$$" + metadata["file_path"] = str(file_path) + # TODO, use PyMuPDF to detect image and figure list, but no link to the image for the extracted text + # metadata["figure_list"] = [] + yield Document(page_content=content, metadata=metadata) except Exception as e: @@ -253,7 +325,7 @@ def load(self) -> List[Document]: # local debugging purpose # if __name__ == "__main__": # # local pdf file in current folder -# loader = NougatPDFLoader('paperSnapshot.pdf') +# loader = NougatPDFLoader('1.pdf') # data = loader.load() # logging.info("text: %s", data) diff --git a/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl index 778c0c5a0bae9226a323a432b7b9c258e79ca287..26c8339abf97988fb45b24f7f0cca235078523aa 100644 GIT binary patch delta 5782 zcmZ9QWlY?SlgAgg#l6^4DDJvgk+QhAKyfJU6z4&HI24L5QYdc4Wr5-pr%>FTF52Sm z(0l$(?k;!p=9_#blbIJY$>e))fA5wkO91mT)&GLwemaT<0zKd-Y0?AzdaAC=P@>L# zeWE=^5lC~?j#XEbh8S7dRSRF*kRP5Y$CY7r5UYwV%*6EJeh=^39psalbPYS8|8>5 zYQ*T+I~`Ndqu^54j(gc7!f4rwDrv9ivqR#JviF>ExFz3EQV!^Nw*NM!TU(%5VW^Em zEQX6Ilrt<#YuK@xJ)52FJbZ%jQi$8ASawS7&BuAwf`%*!Sp5d{=vvecwc{+8?@V52 zKc->$q9;1Y#5V3{BSEHs1y}oyLy_?QixCcVJ%-!sHoW?dw>j9foD(Tr5eCNQZKd1y z*Y87lc0awE<}L;N{cbJ(B&yNnv0XNv>3+s-z%A8zf%rB=QPR$uD7ijGR)l}11v&Ij zYrU(CZ@73fIvccYDrLJiPqIHQWG*Y{dwSV$|GKyhAs#-fL6%!s#2dh=Vq9{ z$DP<)mrJSPYlESGB_^LGi{l!1Q$Oox**7+sK`eoU6WapMojGU3u*DhaV$Tj}sXE$G zdZHa5+hUs8LxMj&_hjXxl=$YHgmK@?e0BLTNa~j69ly?Wh{P_?wCkELdadJ%(nx`>KwsJ zWJisv^J{|?C)(XCwbQwuJ5#CD?bCglLdvHs%lvjHu->`WtaL)oNVNrpG&E9WS;bpAFZaCtfo#o0f0yY)npM}xz-yMoUGYvF;11=gm!#i6kuhh8(Cj@@Laq+- zY5ARhN@(!n++S154WJRQw)Q(RrYBjgpx3-T9;rCBS^zj+Kh*84L*Fc~(VC z`nY`vL$*7KpTA5!vK>|<_IV2M`Ru!47|nw;^d)4uG*SA-zCAEHsS(fla4vTgnCLj? z(u~Ld<_Xh&MEa=e+v7_y)he{`sJENnblcalF6n!A){sIZ2`rXDV|Nx0#=)MsGh-nP zm?u3tCN3sMVTk8f)g(#xl2LG^@1G=slv2}P_n1NR$KS-Kc=g$6QltcjE@t{(Z~NrP z2|*w_LHI8UCcx2kS>dcLs6X`OD=VhSx@@v4H{8Tn(wXKWf_G-Ia{N{?h6a_l;ym_) zjgRZDHdo!-uMqGZvMxp+Tie|Z!TOKMlAJ1fC$h}B8wY0sD!i8!r4)Wi5ysKo<=;ut zu(rkHDb&<~aD6fDwVhhE1O}Le*t=sU6hQM#V0a2b57ccYQ_5zLGkFjHF*+66a(AXJ zN=J%E>i`)Mx;@kG^tRo}ls)6=oN?!7AwL-s);hjqC{jjl7)tG5x@rK+&3Yi+6$rm^ zsSh`}i+x@p15<3%YW%E$Q*nvoEwnSXYoB;>yFvpdF~$6m2i|$1c?M+@ z<7%eiyAiWUOH-o%6)!=$17#BH+(AheFjLH{aSO!r@F8>{h%Uo@nIPN9Cn=%di)>Y? z*_oVDcomS}y%47VaK*xz)?><5&WsadgmJ|OP=&vTpx>v)Ci70Fth{(QWkK)Q;uOhc zO(Ig-SRt1*jibubJ-_L#qXuWA)lg83gUH$|V;Ul9z(m}?NN%`yV*n(}kCx;lxg+S9 z5l){F5feK(QXDr7+G%Zt#t^#Ukg-~%#(ixs{7XmPE=d-glUTsh=WK{+_O^vUWd|Mj z&8=Etg+jC#qy7avf_*IqH?WRaOHSgi4a9u4dFq{qww_<5DQ3Hm80oiiJ>JnO=})&yBYegE(QK`Sc)P&`2?d`>A%Gt5`G=x4%?tWy;}% zqNv5#gGoO(8`Hct%UU->-#YGiOcDvW`ckBV8lB87C!T{Ndb!Dk711#N;^D^2uU`aVK@9}MvkdMBk*drExpI1?V1+wytISe zuuh0`HWOlki0*hd&ie7Nc9Wc4*M$w6YXY3m<>}`e@q?I!_r@zlw+BQcqnBUf(_X@8 zEc0$sJg_91tUfw!@u{Uh^GO*i1MxWaaRxaf=jOHs+Ia8LcB=2>bHcrhvj&UCO_+)y zbl}U2QF1`t&$i5I6*@GdFh~X3?sMjYr#%<=uyP_g)nQSn=My5!1NTmHWZBoR#s}_q z1)p1^1t-`#w|a_Jg$feRmlD-H_S140cu}XYB)HKS!05Kk1gWezLJ~PAeS$X~O;haR zpb4y5=GFr*1E6QyUu_+0jaxu1T%oGrpz}^8`&~S(^MbBstRE0+P$hsfsyc%jS|BM$ zBpv+VeueA+R}Lno7de{B7bg-?;xF-jKmUo1wkQ)BBBRR@#)Xc$RLxNcy^Fn0X1az- zh>24xCtwtLbV0vBdmyoIZB?V!h}ux5LoEqNJw{y2(S0J(gi1F^=&nhzR-=Dqew$Pz zEm8gwjh(Z-s3Z5u@D1>yi2xVQ`wMLcT!`A)8?OYXXekJ8sJ|Hf=Z$bqX!RJ>mP)Cc zQXSVphB;IPbSUbWL&6aV>gG{N&ka>tK&OAbboG@D@veZ$NU@4z7*@u{$WZ(H{!K_5 zb3pGMUl=p?Rx@tFC+saVnU7Ql}l%e(YLLY!c;*IbSBE^HKB%4-> zuoOIcQR?v1OzAFvaC1R$dN}i6e~GwWXJok^AEY|2`k5m z9#lWBJ+3G&{L3By)Xc@pFVEsrXwcJqlY1ddcB1ECazTUTRwYL?8k-e0$$`rG)ueaJ zn#w42s>aXG#`C5=AYSR8ncp*4cJqxEO|M@jQ}QW%U$TELZ-Wwox~i@hudd)lcHvz1 zX_u}F0C)*y7{Ayj#(L@LVd}smA1}zlkpckj9xj0jxXtf0M*TQw${xTeZPJiSkUcGp|z&mq%@6gulg!nOZ$Weys5t1B+ff7$XiEgQSx(Ar3D>NF2?F zORJrjsBGmQ-r=XQX)O}39Ij9JNZQY}6QKIGYGFeX2CKiTHBpSwj3vmbP1u*q7jDSR zk1Kj}kZ41}R2189#T>S{j2?!1btHGnxnuHFnA8A03ktrGE=KGIE!ETE0~3;VC%v$k zI&Wm$jq>v#+6OPyN8Ky-l-X!(Ky0;Z|P6AJTM+e?IF*cq)#R;5iPJWdYE@}- z7zptP4pD!=%c4U~Aeiv%&Zi>W1k+*To&)1NIGgoRyh{q=jp-|uyHr{O+9#3(az_+n zwv^+a0jMd~9OhXLst{A&VBVO2e*0Q9eR0Le?C5u#W%4nl`iV5i`dWaajFEGyhUIvk zD>4cZ#~SIG1vUb`80+w5Q&N z)4DCo?RE`nDFj!M2zpx58&=^$O5i&V37CxQn!bzDlXOG<#V{8?B}uhkIStYAZ#GLx zJ}<`eW41%7t;?iKipsDI8n7D-n+H0YH(&|#4s%Rokx{w{P#Lq3Q(%D4{|HRB!SCh+ zOi($DK~q&aCVE*~!RScs`mbrvQfD!#mYoV)rR2p0nwajzr)#H@@7YYw0N!ig*!O0H z@kq1BuK>^oclr~AwiM9iy&RCN5QNB0vi_j|mmu0K7d@b6F ze^w>F#%m}>)@$ebN=kH^bEx}lmB(U2Pn@UP{nrf*H}L^Z4;d9)0n+vyyM?$4Z>lHf6-*V#olnwOt73=E=sR9&R2k*dDN#a{WWH)ukF8|C@^=cWsG{jGVey>XFiZMeKKUsE$~9MAFvYJv0R9D06Y^~3b_ z(B=)ArM?>@u`kI!uwN(Ds*k;i(Pkm2Wm5u2E6%i{_A>wn5Xs-cPg*sy`OIGA6jmCY_Om?DoC5Bv8$&enBwt{DqVU$PPt_{g6lDM0yIAJdE&L|GK{w!bR`F7Imo(h zx0!{~r2(R0Qq=?xfvGB7XY|^tQDe^K_OIJuQjb2f>wVuzeJWw4VyuP1e=;9_YvVa% zrN=ibS8!t?^`*3_=ZURWahpA?n%}c<5G7CzxcbJ?|FeqUL!uJSRRdSP94-;vYf{cY ziSh75Oj`m)L0m_B>(d)G7C&zDFy1}qmvkcNSOrW;vKC|><|!|wd`Ga*PF65}Nen3q zb$CDMiady-8k%T+4-!M1#$DrlypVxCB32JvW}bT`7b%KIvKSRCU^>$uGo_u;JOo(p zLA@MAUqIl~7GpIYO{P-6cN((K$}r~i*`}=iz!er=zTCL+nNCg8FuNl;;lV)fO2171 zBm-z3Q)Y7Sw24IMy&J}?P2t&dbvrP;G73Caj^VeC2wI4CZ<~8|=3kC^AA}eDZHc0w z^=?f~eYc*uXL^5JYW^Qi#Z34U^Hw$cWYk?y4Q%7<6LNj&i!>G3+uWY2&)g(7rWMXr z6C+6)KxDDVEDahc8ps%jLsWJ>6iJFHH-OZZJ+3|8$}Z2%;yZc8zz3nj00H5Z!Da7D zgM9HT=2~9fvl;icwgfuU*(Lqe`4A@#&4v3+JaH0c4z?oSWD}2eV?1P{k+*t?_QoGCtv3{6H>GuUFR9Y0sZ1bbb3(Nkp8u zP;6d_iAy$tYHq49K(28H2xuT>6Y%VbG}rVH={0my-F?z~Ggk13;&_}Np-6l8DSdL@ zp3Ic%?ic4tQe!I8*r-F`m-qcDbhW`$qTK)abl~y*&)eJG+>S`KPU7)ypMu?qf$9D4 zS0YojDc47(-SyM!BlXTkW0WTs63(ooA`f4ROZ9t+neam1Lq2@zHeQR4?cj%_dQ0O` zB$B7)-C~WK1D4)*^!L%$JOi@yajJYBbx>Ngg_~{7{dC^sJxu}eg*%{LjOhXgjmFgs z&d1&RSFgA~{naEM9h`r;J{18Anl$`?hZj(fFOUJrAOmwToSq{3xrm4&G09-|t8YN6 zY+R*sF$$88BPFcQSOY_NdHtwFA*|pmNFWxdF&c%hkaS)@qsZY~O3Pti!(2zd$foRB zo3h_*(@&z_(PHhw#Jk^n($^t`sP zO5WEq@|#i9AOph!^;{1!$-zli7-Lrq?0VB7E4LbEqPKNn_5hjRqT%Yx(JK}X!^88} zZ+_yKwiZ%G-3AK!s@FYuJ6C7SkT>puNb>#KhZq4Lk-?k(f@v{CuJ9}s!c zgIA+?Q2MT6&=LzXjajro3s=XGTbd{J(!IN*DK6Tad>vwx$^*d%#2>ILe0*eu=ev;W zSGj!Zi*-udZP%w4XB;YdjXPY!JYTkC)YD$7pdnCX>t*NfO@}LDj6_@H0lE!`h3i~2 z8-qr)JZ4h|a3jXg6La5#e+|}=x;tt+-kG^Sv7Pm(W#QD5?olT!3HlLc_ce_6J@{}e z<$SP1aZQcc{wHvr(4%PPwXKb6>zse7m1O{QX^qudasNLECpPEK6|x@ntEjLY{B>O0_ausp8l=?0Opww>ElMsLo3 zFy%e`081)9v}kF&|Cso5%nW*J3v9lvh#P(H3V`5-{M!G|p@p~b>!FIH!-)lyP?Lz^ zM8u5n00A)M|5`gB5Dn;mqannZ5`I8I2gelz!(TCxf&V+z5Yom1KNWa|8pa8yV`KQ6 zw-BU9E#!rV2(tZM>PkTx)M7q(w-7ztgc}=vEC~LKpZ>*;{C}~S5b%hyE8txZ2GC delta 4893 zcmZ9QWl$81x5k$xmy{NyOHyGa1nJlX0cjAB#-+P+SyDQdkPzV|rDLU$kd)Y^K?Fp) zq(NBX`hV}-J9F=uIdf)y&zU)2p7Y@xtUJ~rMS)yhPtQm;{kTII7XZK|N!4P3cN%EA zEeex%VvUHF*~KkeVm4{IV`@g~fc?Z(xW2pu3iUh6l{NusKg0yCSD1w`N9=b~8;718S4w%U&4#8DtY^dFY=UMU58stEL zr+wEgHiSJT-5&v0;ypdL<)4E5U_;{YZ*z=(rYx7eW;KVwBu$wdm}tw1GZlZjxwn`1 z_eZf>4L`86_a(93Ivs(}9Zs|Wq9-Ahkk20S)nWVw1=z|etNkG0@F}yV!groa+(}oo zZkd*+rFyc}D{P;Pl-9gM?DaD;UV8#`V!m&PMw?SSdzE*gKP#zBeq@IeF=Z!COpfkT z2*uznQ~S?u0-lRyzdQw;E#I3S>+$`0TB&*)`e4+WM)5hFNDy2TTI{2f|3R@RUPAey z4sEgY)+hA<;0HR{Vf&AuiyZCT8ofZ0+5EUZVy)DCIFY2JM%cNV$BF9zA zlfB>y?J(q6^cf8_nF`utGM8v7>7tHXlU$dj<=Z`nX^>vJ&r&b@*1b`sw%zqMOCkpf z`PoFYa+SaQNeO=!Gn}iLc=Px%?e7iy(729Kl@zSY0Dgc&R_)zGJT#wvj^M%98$dQk z-HIuazL4XH99E$NiStEl#5po(wg4Et$E|nV-P6 zFev~)%8!hq=Y-#C{TMj8#0nUWE6rD(x?GP~mqLD+iKA%%HYNj4#+l8?d4qnq2c74cvEKVQD)FpRri;Kj9f ze3Q+PLVesSEq&ejGvJP>be4la_-zWOPF&raDqBIiftBsyP8MEvSi132hN+`u548XBxv#yiR8XeOgj=xnnbe)88_L@K z6vuyBLrInn(5KFsE2eEB;yh;$0R(uR{C1H~r|5yR?3mCr=rjK6_==TWbLi_PFvHgB^~z#5J!@ME2Xa7I?jh(kd8W#82S)oS|ga_FV;+@P1wA&Tx=BnKI9*|QAz(hVieZQ-PXn-x^fIBhgE z_APLZ6I*?AFZtP6V0;(dy4k2$s&Cw?A3@}382$tvKYj#R zF24{18eH)wSRE1-VG7#Mkgxn(GGpE zLeY=octzvg4e`^ROzQ&tcoxRJvM-173C)7ttXU24Cw2o7GtD!onWK)oL$TQj+ZcFL z57}3nDGQ7$Gj7MEq0!nnj_w@ayG?^9imhos`wuYwgYhH0*mUcGm;MZ*;C)*93a2o` zQA%|R0<%u(YoE3Gf2!h8PzHQX>r7^OArB+I$Z%!EYN_97O#u%HflKu~_VK`7&s1K7 zLXuv7rH|nxf#e746pd3LN-@@3N)tZ%YbsKP#5de-id$-m##Y_)Jt0PLSYg#SN;s3G z=(pqxpR>TEWT2S%W$>mqJuXq1adJM@jf-JPJt^GEh0Ha}Z=5IBMqPlk#i1glpnTG* z3gr(CEvn+*63V-bIkcE);f-7M2T2Bo>gxYPkE8S|++C=5L`4A>b70aE!VAw6?#@Y8 zlY5MsT`5z(GMCd~w0t8nVGzEs*zce7T2hME`-HdDFX1eQ-8N(CU4hk&Yc$$~)9)5@ z_)a;t;zQ5UF+T3#l&eq)c)SBVn2 zX$rq_Q^PsqNG4p-s)U7jh6N6F*3#?sk2)bU`l-<)rwWvrH0d1r$LcxQN|vSTGk1!t z1J}E&l{tO-Hgf?N@ghT)Qv2m%sctrv=9y-lfH~bh)aptRf5rN4TiFBq_r6kRraZt~ zqi^XHw|NK6wi3!Yr1M%yvh`Z~2u^3i%g&c9Tq!&wO5_UFiwN?0>S%bX>X}n(Pc_>F zM)lwTie*i(UGh%0-CL>|PD8TXQYZ#(`xw$nG1XT<`ctyRGM{~q9L3r3C=zb5`<&(U z)nS^Pi`zU|Cd%2vrk?Fa#gm>(ZCVphyhoSHHlngCh5sOIAC*Ja=+@aaw!kl+8fbhB z0`kU^sf0_%qU-#4f54@>C!;JeSA7fwDH;pVtn}lFuY_xy$m4!U6XLrhaq6Hnjq7T# zw@24i)*6}RqWrQ=*elRBdwHWy`ETk!50m|e6`~(O5zxrq?DLz`ng|6CtBBsQ!d8>% z-f4V*Yo|}Iy9f7#|B_HrprDA^Y{5XI9`=JX}sotiKn6hY_cecUHZt^n`@(=O_j%_JQ zd;17#do0@{F_NJSM$nlAyI?joUs7UI3!h?Zl+C(Xgs^F8H_Q>t$ z6H#%0!v=#tq{3Hh&rFylXmqBXaJ4a1(Ao&5S!o$Rm&Y5w^6p z5!B8r-8b;1b4H8vF@7sZ!~1ri+URJ(ZzZ*Y6h5a}qS(eUZKqip;VFMT@Bru?-x=$k zo6+G~E{E{MNyAI0rmDaT*g8{JdPc^D0|>>kMdaiQLT4GOagKrDGA@I`IJ=@c&;ICw zO<`*05E^xPNJni^cclQ-3~>v6lgva&H8t&UKd|=F`8j2u(*~2%f)k|KE&9ib3-zf0 zT$qh^Mm#UZF4Bx8aky(RZ&_yJ&BAcCl{3yC?fl34p9Kq&lL_E};t+;gK*j zR%2@Gg5}imDEaiFKU~_g9})%e^d$Z>mw`@b8X~vPPRKobWyy9s>o+|>MkBe5nj}MY z)Nx3EYoBnBKt(dLqaW$(7PT>qU56OHLiA-XFYNeA>&d12jh(mLIQ4rjGepg6z)2-? z^$w5b0lvpARy{BnK7@nwuKa2q@Q2}+sPOR(rVOL*ya>n#bKTM+mbUMf z=Y!I%oK3A<#nU=g9GP!rX2WgfsD3^@RM$}oa@!<`P%G^;GF&^BJ|VE%u=fMI-MackY`g^l` zxXcL{R#Ilm;uBth^)xUdp+vmxtKCQGG*HxQ(=YeG_x`6 zZIdZQ>LlV3qO^gPa|R7ZGO8Yg2S?II*7q?xnjV;3>6Pzp(LD}>3doU{r#-$>%qTZ_ zPx8|>YOSX6e9*9g)}?bHu-Rw-A&nW*`@xt#&aGmmpf($6XM_8(c=t#8nynAxq0)N~ z9D0HvG(WU=ZT9f{_&--R?H1OmJdx8mbwa(HYao8AqV_F1O00Va-o&yb>x5$-w?_w% z_JIu(j*YyZxj_40j_A$no$hEtf41*v#$UeJA8I!Bo^}nJVGPMvRx9iOm@NILx}<8I zhuo3ykoDW5cAi8_sFyBy<@gKPhmWDx!+YE70z1<8kemIcogdWS0vQfo`<&(c@btXN z?R3Lto-%OU4*VHTfVkZM08c+%l;xyAcvz|x2NznzR+K^Ut3rFTv;{IPuU?>aSDFPv z;(hefEBVkHD7?AciY2}2XgtXNo|b%7FFI~#kf7|xES{}R=NhnCZ=4KtBI7{{igbXU<~aERH<1X)Uxviv&*a0 z8hDL|x0&VX@Fhg~=Rb+6d%nyRe!K~>;JxsJ+wIGptd>=kOr6L?+_xu|Lz130GRyy?&A2L;eRU%XW Date: Tue, 31 Oct 2023 14:10:54 +0000 Subject: [PATCH 07/21] fix: add openai moduel --- src/etl-stack.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 25465fb1..1a136b43 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -70,7 +70,7 @@ export class EtlStack extends NestedStack { '--REGION': props._region, '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--DOC_INDEX_TABLE': 'chatbot-index', - '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat==0.3.3', + '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat==0.3.3,openai==0.28.1', '--extra-py-files': _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl'), } }); From 4a2085c333b6f46fba6d90170f4c6b038c04d8d5 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Wed, 1 Nov 2023 09:51:47 +0000 Subject: [PATCH 08/21] Fix embedding model innference code --- src/models/embedding/code/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/embedding/code/model.py b/src/models/embedding/code/model.py index bda8a3cc..b3136d0b 100644 --- a/src/models/embedding/code/model.py +++ b/src/models/embedding/code/model.py @@ -13,6 +13,7 @@ def load_model(properties): logging.info(f"Loading model in {model_location}") model = SentenceTransformer(model_location) + model = model.eval().cuda() return model From ea2aa28688207ef08d33e73b503cf4a8c2f8d646 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Wed, 1 Nov 2023 14:08:14 +0000 Subject: [PATCH 09/21] feat: markdown split based on title & sub-title --- src/scripts/dep/llm_bot_dep/loader_utils.py | 135 ++++++++++++++++-- src/scripts/dep/llm_bot_dep/splitter_utils.py | 99 +++++++++++-- .../whl/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 14459 -> 16003 bytes 3 files changed, 214 insertions(+), 20 deletions(-) diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index cf29511d..b56097ee 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -9,15 +9,17 @@ import csv from io import TextIOWrapper from langchain.document_loaders.helpers import detect_file_encodings +# from langchain.text_splitter import MarkdownHeaderTextSplitter +from splitter_utils import MarkdownHeaderTextSplitter logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) metadata_template = { - "content_type": "", - "heading_hierachy": {}, + "content_type": "paragraph", + "heading_hierarchy": {}, "figure_list": [], - "chunk_id": "", + "chunk_id": "$$", "file_path": "", "keywords": [], "summary": "", @@ -28,6 +30,7 @@ def __missing__(self, key): self[key] = NestedDict() return self[key] +# TODO, this function is duplicated in splitter_utils.py, need to merge to one place def extract_headings(md_content): """Extract headings hierarchically from Markdown content. Consider alternate syntax that "any number of == characters for heading level 1 or -- characters for heading level 2." @@ -163,7 +166,7 @@ def lazy_load(self) -> Iterator[Document]: # assemble metadata from template metadata = metadata_template metadata["content_type"] = "paragraph" - metadata["heading_hierachy"] = headings + metadata["heading_hierarchy"] = headings metadata["chunk_id"] = "$$" metadata["file_path"] = str(file_path) # TODO, use PyMuPDF to detect image and figure list, but no link to the image for the extracted text @@ -323,11 +326,125 @@ def load(self) -> List[Document]: # local debugging purpose -# if __name__ == "__main__": -# # local pdf file in current folder -# loader = NougatPDFLoader('1.pdf') -# data = loader.load() -# logging.info("text: %s", data) +if __name__ == "__main__": + markdown_document = """ +# Learning to Retrieve In-Context Examples for Large Language Models + +###### Abstract + +aaaa + +## 1 Introduction + +1111 + +## 2 Related Work + +2222 + +## 3 Preliminaries + +3333 + +## 4 Methodology + +4444 + +### Training Data Generation + +5555 + +### Reward Modeling + +6666 + +### Training LLM Retrievers with Knowledge Distillation + +7777 + +### Evaluation of LLM Retrievers + +8888 + +## 5 Experiments + +### Evaluation Setup + +9999 + +### Main Results + +0000 + +### Training Pipeline of LLM-R + +1010 + +### Generalization Ability of LLM-R + +1212 + +### When does LLM-R Work and When Does it Not? + +1313 + +### Using Different LLMs for Data Generation and Task Evaluation + +1414 + +### Scaling the Number of In-Context Examples and Retriever Size + +1515 + +## 7 Conclusion + +1616 + +## Limitations + +1717 + +## References + +1818 +""" + markdown_splitter = MarkdownHeaderTextSplitter() + + # construct a fake document data + data = [Document(page_content=markdown_document, metadata=metadata_template)] + md_header_splits = markdown_splitter.split_text(data[0]) + for i, doc in enumerate(md_header_splits): + logger.info("content of chunk %s: %s", i, doc) + + # local pdf file in current folder + loader = NougatPDFLoader('1.pdf') + data = loader.load() + logger.info("raw data: %s", data) + md_header_splits = markdown_splitter.split_text(data[0]) + for i, doc in enumerate(md_header_splits): + logger.info("content of chunk %s: %s", i, doc) + + # official splits will be deprecated by the new MarkdownHeaderTextSplitter + # markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + # headers_to_split_on = [ + # ("#", "Header 1"), + # ("##", "Header 2"), + # ] + # markdown_document = "# Foo\n\n ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly" + # md_header_splits = markdown_splitter.split_text(markdown_document) + + # Char-level splits + # from langchain.text_splitter import RecursiveCharacterTextSplitter + + # chunk_size = 250 + # chunk_overlap = 30 + # text_splitter = RecursiveCharacterTextSplitter( + # chunk_size=chunk_size, chunk_overlap=chunk_overlap + # ) + + # Split + # splits = text_splitter.split_documents(md_header_splits) + # logger.info("splits: %s", splits) # TODO: Local debug CSV loader, remove it before release diff --git a/src/scripts/dep/llm_bot_dep/splitter_utils.py b/src/scripts/dep/llm_bot_dep/splitter_utils.py index 6efe4ffd..92a5e0db 100644 --- a/src/scripts/dep/llm_bot_dep/splitter_utils.py +++ b/src/scripts/dep/llm_bot_dep/splitter_utils.py @@ -1,19 +1,11 @@ +import re +from typing import Any, Dict, List, Optional, Iterator, Union from langchain.docstore.document import Document from langchain.text_splitter import ( RecursiveCharacterTextSplitter, Language, TextSplitter, ) -from typing import Any, Dict, List, Optional, Iterator - -# TODO: rewrite this function to use the new TextSplitter for mmd type, and this class is not imported into glue job for now -class MarkdownTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Markdown-formatted headings.""" - - def __init__(self, **kwargs: Any) -> None: - """Initialize a MarkdownTextSplitter.""" - separators = self.get_separators_for_language(Language.MARKDOWN) - super().__init__(separators=separators, **kwargs) def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any: # avoid importing spacy try: @@ -75,4 +67,89 @@ def __init__( def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" splits = (s.text for s in self._tokenizer(text).sents) - return self._merge_splits(splits, self._separator) \ No newline at end of file + return self._merge_splits(splits, self._separator) + +class NestedDict(dict): + def __missing__(self, key): + self[key] = NestedDict() + return self[key] + +def extract_headings(md_content): + """Extract headings hierarchically from Markdown content. + Consider alternate syntax that "any number of == characters for heading level 1 or -- characters for heading level 2." + See https://www.markdownguide.org/basic-syntax/ + Args: + md_content (str): Markdown content. + Returns: + NestedDict: A nested dictionary containing the headings. Sample output: + { + 'Title 1': { + 'Subtitle 1.1': {}, + 'Subtitle 1.2': {} + }, + 'Title 2': { + 'Subtitle 2.1': {} + } + } + """ + headings = NestedDict() + current_heads = [headings] + lines = md_content.strip().split('\n') + + for i, line in enumerate(lines): + match = re.match(r'(#+) (.+)', line) + if not match and i > 0: # If the line is not a heading, check if the previous line is a heading using alternate syntax + if re.match(r'=+', lines[i - 1]): + level = 1 + title = lines[i - 2] + elif re.match(r'-+', lines[i - 1]): + level = 2 + title = lines[i - 2] + else: + continue + elif match: + level = len(match.group(1)) + title = match.group(2) + else: + continue + + current_heads = current_heads[:level] + current_heads[-1][title] + current_heads.append(current_heads[-1][title]) + + return headings + +# rewrite this class to use the new TextSplitter for mmd type +class MarkdownHeaderTextSplitter: + # Place holder for now without parameters + def __init__(self) -> None: + pass + + def split_text(self, text: Document) -> List[Document]: + lines = text.page_content.strip().split('\n') + chunks = [] + current_chunk_content = [] + chunk_id = 1 # Initializing chunk_id + + for line in lines: + if line.startswith(('## ', ' ### ')): # Assuming these denote headings + # Save the current chunk if it exists + if current_chunk_content: + metadata = text.metadata.copy() + metadata['heading_hierarchy'] = extract_headings('\n'.join(current_chunk_content)) + metadata['chunk_id'] = f"${chunk_id}" + chunk_id += 1 + chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) + current_chunk_content = [] # Reset for the next chunk + + current_chunk_content.append(line) + + # Save the last chunk if it exists + if current_chunk_content: + metadata = text.metadata.copy() + metadata['heading_hierarchy'] = extract_headings('\n'.join(current_chunk_content)) + metadata['chunk_id'] = f"${chunk_id}" + chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) + + return chunks + diff --git a/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl index 26c8339abf97988fb45b24f7f0cca235078523aa..6072a808c9e103090f6571cd9235b92b5f6372ca 100644 GIT binary patch delta 7348 zcmZ9RWlS6lu&x(aoL$^qi(7Fk?(Xg`MOtLB#clD$i$igzl;T#TcyWg!1&S7TZqJvT zo7_9eWRhnlGk@M6Gnq$^@Sp}?VsNU5l|k`Y*PS#T0N^VIk0hpr&Wo;ktqL`rJz)@t zbL!y7HXhqDDYWF&s^+=|F64i6JyUfP=8R0G!E`bI0(4UFd^#)d@TB!iIkDXrV|N&( z4V5qXDfm&4CZ(pS(^B|0!XMUL&6Z-Gtzo2Y01?>H_J@h7EU~g?G-mOx2O$ooz5>l?WA5$#-CH5&DT?3|+ zEB7IP%!A*8B)3OCwSb~iv^ZY@b#gZ$Sa4@E_qp4%23Yk?&f1`;2lN(DPR7QI zt5@O*bBaURQJT6WcS=Svh7wk3<&4zo^ep!7O&@9N!m>8c{apRQ(16<_1rN#Lr3z)g zK^VO=CKH@b`ObFcvNMACrx6v1Z17)oxuRNlk4u;CQ-0zeZ**V}WlD3!vRBxwUaxvAcQugeX zOU+mu^*aw3TGg%^sxHf7@?4dRn^1>?cV1%Cp{rCbh-u{991Yy>3gGtwujZmKw6 zUlt7XnUq729LUou2^X0fbt-z037Z~jM7K|ZS~}At;QF_BDU_|w&dzH=t_3xj9u}Ro zqmjo)#T2MRxY0bULbxhLiF!30>mu;OT)(p(M3%nvO%8uV?ZoOY_KJ}JNnLIgI(?fs z4@nkc%9b5;MOW;bopnh*1DZ_vv!fMU`w)bUiVcQ+HYysHWih0Bj66vsS+$PF#H+}G zYT_B%60M@P>@~YkByv~hZpPH45iKD$;pHieSv$ugGk|M%9C8#Pv`Zl%uDs9L3Z%nX zAQN%rU>nrF4LeZG@bw|eCinWrt?C;HIW2}sDdR0GcVV1j71F0{n|zcxO7lIVHjJn) zbI|1Uum0m}iYql%cI2-ul~pZgFkCID*D@%Ghs4|*2b~-s9Zx<3X>dx@_v|ttA#d!^ z(KnZiOo$CLj?jhNEMk#mgQMmYzmOUpf~GL%q>qPdwYN7s0|RlYoTfbKwUIp18&v3l z71!ly_AkKgyx#{$mLx!d5dNazNaO7rxmA>H)a2X!VTI9bD(U!iP6#JPB@I+!-tkqI z2z8a{9K|}bcm+()9h?IeeD^ofH?`z9_a~O}&l-RdjAF}7_KfAV_c=O=Kefhrw8M#V zNd+b<;xYM_bihq1+fh9{t6Ch+{jui6TRCNReR&goC{*@3!V2yoo80s>wuW^qG>79d zqQOcCIkus(`CQYGFwly}N*I(G_!+q3o+-wtYd;%m^o3o6p4JQzn}sIu)4q`eFI6q0 z{>i#5D`pIW>4UloH$1|fQVl_#7xr+9Zo$gBXnFy)kZSBdd zat)c|yU6RmU8A3W;MEY`Xpgy%IgR~ld}%dMQYdm8LxJ~Fkw1k=SD|m!rnVpJgAH4J zq~^E`0-1;qzbx^C$YIbT_t>}wV;S+df#F7Bx z4+6*cteACNbAEl(dvQUVycgI7^>jERMr5_Syll*elvvY>C^(YiK;fCb-tS*slN;7O zNKV0J6{<*vCQV_%fC)S_mg@BnM6g&Z!S9CMIa6mlVXWD=56MvGc_t*Zo}8p^yeE?h zWjE57$Tq@nc^f@f0;?I;B$_&UME#wrt6#UL6R##|O{qj})WI4X+M-jJKB8sgJC|$d z3~9(;SkPI_KUGi?X?sxd6*(Tw_r`(i5Ak#FHl2*N4T@!p84QoTC|1b zP|MvWX4IieYcE47vDy016z*z;yf*osS5il`+Ni`1^0}`9lxrb-Ko4J9Vo9PT!hB2} zrnVZ=cOpDW9hLS8H%M-xXB|Z`G`=J}<7Yf3d?Q8z&%_0U7d$#{n!s#QE3eeX7^_{7 zdSR$>qk-H>+!jO}X%5rH{ye5Y2yf_TCjY#DSpJScwLWxAKnHF0y9kL?yw7rL3dE() zX62mPl2Fb|E3vmG{7sWx>K8E)8$HTX|AC;^_GRN3PoGdl0eDHv? zw~Ej+8sh=4p%!!|LWUKwV5de)W%T+4bTiiKKm0QgPa|S-h{_RbIH9T)tEJ76KB&DB za@azb+IodOl#nDxjrr<5dXR4Swo1m5N-0)*JqV$IreAXCVnduHyi&T8#V&>M<|AW+ zyB_j(%nk1idb>m=z_c{4qjR}=L%x}@c^JxL3EMJj_=K?v@qyVSGf67mNykkV$+S_E zFI7WQ{qg#^L-LSVDFAC6I&gzP8f^6O5)8LN{?TdsQfS;VG6`c0-p=>{TV;I02JZ$8)5K$^ zicoBSLcV+(`K&)a24bv|A42YyuC$D7AGR9Fcqe#k%DQ3Jhatusq(zj=O6m^4l~7tvtT-iJlR#e(0PA9VI<}qKLVDHeM5w-dagJK~oucIb+gA6Ooac zK?WY8tK?w)mc53lDq2WCzJC2Jog`ZA=(QFuA!XHX94vILxpLi@`;BW|UU~bR`me8n ztljF@_crLX0^h?F+p16uEcviwfAlWmiYVVuQ3QlVM;`o=8o?Dt?!f-|1`SG3Lys5K zFI2}ZQMsX~(-y4qauVK>q59}!_sn^7W%?S{$VYj1#7nl-Td(y@*B9Q#Z*Jfy3szK8 zcMjXf=kG$l!V0?CN&3zz<4s6_Tkm-Ae(!LuD2GVayG(y#P6)gb=7Eq4jKD`M26~m=ORk0#e4=R7#1Zv(RkPEhI-yM?HQPbD#b=WS8(9JV93feP%IQf99$0<=tS) z(C55*(YWujeDfB;N!zLJ75F>N=Tmbl{6(U^Zhy{vWA2dAlC?QpCQn8^^W^C-91dn( z#I3DFTUWm#Z=GDbHu};<3xU%k@znH0%4m$e#xdg9-4l$-g06eMRzlT&?+hh~Je|fa0}s3^cH9ymJiDz>8O78y}NT!&a#s z>M2)lILbzRf?msTR&kt7%zb$;i<_?=a4NKDa;Ka(o|oJqqHtD?)Qu90J&*lb0%^|+ zZAF%G(85~b=cp9

Nkc%5V!TQ~P7ggTnt}n<`6}ry3}iw!G$gh*pAq4m3HAQD|!q z%KoKg#d!yEh(6o`wUp4%DvtzP~){LeZR)w*%Zj~;uN7335X!kwpZv0}u&1>u=1 z7DaLsNWa`XUf<`umXXb9JXQ@vpeOC7NcCHu@@w-a%G54;by6&jbmPTEoc*-ivRd(Pv1mKg z$;d$=*9JSs-?=$T(qOPV&bS?At<+*2F`tGW2?NKMD_vv38#Df!vp0-M`^~6Uk3&Vl zn$~A*?EIy#bLD;*)piM927WpU1rAxTJV|M>MM0$NyfM$Cvfq<9T=&OUrbMh;n(Nf( z#%X&$Ii#Gus?h%Sgj+0I)7m2%mc@2Lk(0uG8-DwJEeT%bHiNhHGQCcp=@PWN&p8HV zyJ_1x_cEvqQ@V*Kbd_Lwo8E5Y-C0@b#_bxZxa=D5KU1 zTej|z7Y^>uMlTU`I6^FvO)6vcsOZ*iCx`CD*HL z!D4&g$swwl`Gt^m{UxnHK-_C@;f_g!IK!{w!}yK)=|aw1kI$Z8`e!+2wF;b{M%Y8? zC!NbYuUG1)`09cV`!dKQ^mvTQ5@(8->iuXH&Te^|+xr_RIUU^tHnI%Za)13Gx}5Lc zvnjD5aXEpbhfxXQGZQ}UG2-n*J`6y*Bik+@ma6yV@%J{8+3vj^*XB-|hnDZ*E84b2lP>Thgi(%>-}pTZjVA^^QI#jI3c9A3q^)21619 z{%qBK-FocHlX(=U0v-$b?FPq`srv4*EP5W4P`6&^RUVHfX#WYSgAn)H19qHUdqfIJ zu>+uBsu<1y{`|l<`Cac~?el|(HTGXl)k_FeEOwV>680T>R(%axpQbF2eC0LdUH7>X z3xk&r-_GDTqNe(hsm(o$-9O1BezN9EfEb4Co&^v+l8eq3@FeBd@6s5@N+DDUCO;wp z$U6gD0`KWqUgeZgbzZ5PJD6sGV3+q+1~Inskv%8?KoueU6B`56aNT`X00(+&;n{#{ z=uBbvqXdDNc?FEQ;4L>R-WEEb_+`SV7?@JEL-@Su>q~HLC8`9koeh^s8Gd|Pk1lSa zriVri?gR4yH=VZ-%yxF%Pf*!3vV@O{)Vp62Up-U7b#ZN+MmKb+p%|z) ztDg0nlDF!+a(n9wM2LjJ3mvBF$`FV1_q|qybeZzlATO6orK8VQYq_N>Y5$?hMK6IR za9gD=)tJKzr`gKr>zW!wN~6{{fc4DU$ne|lbE{o;CWj}KB`syWq)@on!B5Uql5|?Q zU4xu$MwbbnS)1kDx(jKvUw@f@t?*`o$;AU7M4m`nKZnMednWO`FsLvcgq{Z;_;o%Sj)<16NoqV} zCg~;+^93z7mzu;B^K-fxdsrgq&?~Gba!brgv5042>Y-UaXqK#DqWfd_KnXFIfQeS! zc-r*v6G`0JhjmX} zZ8#40`>Vr#cd&ov$nOf_ou%s-@pT!azm(a1H_dHRDq!B~PvmaFU*!2hZRr18QzazP zrrz1X&=S9g_2LULPd9O}>uT)Zd97ozlz^|}FKa^cr(>f(bkoo5p$Wu2fN)yJRChHXHZE;l-p?}FO8n@+~fn;F>i5~ zlF|Whu;aoR@z$Z~Y-4(#(V%0Wsf;aM;w>sC2KBT^d_;xzTrSHH%KK9D?@L)kCXf#9 zWsog|P6tuj&K9qai^LN_#j3))u$8@;)r%cUpXPe7pcu$o&y;4j%Q6Wh@9%b&koVhn^g2S~gbL|6r71~hM zeI^Fog5TAH#&N*xPc$kxh7sngzc)=3fca<(7;pM`&8xF!!^wxwiq{zKiJlMZM3;Id zTeQ%NrAyH`CJvit_19=0A|i{XMW^+ARP>ybE|rc^B?i>%<*ZSx^TI#$*3QmG$8!GR zg)%bM)N+hp9lLg{%Nx~hRY^w{cbY=;)-g)9b;_qmHz9JQbKh#Nkr5GS1-gbul8#p< z2L%9GiG}wd=0ZNDOwsNHeC`rl`1B)x7+Z>!V0dSfVUJ0$ML>|tladjG2(`v7`0l)_ zz2#KQI?f%A>cp8y5YOTOH+Rwh)wvg+jl!k};|cRnTqg4*LM5Nnnl;+!;v! zGH^ehtp6fDuQvMsRPyD?21~-<^KwCzQi~-4k-vfpklF36eq!1-u^@N??BIRLs!R-u z+GyoxdIpY5TEutTNw+x8l40m0mbVC!^ZLLLJwY8A(htQ~CWc2*oAVbwAL5-n9%duH zIJ$UT{ra^KQ#ZS8W&3x}lW&UP5>woYhpJbRl~2qmyTvgakY^?P_WQa##}yK^M{O=* z@YDK0F)kEu@18f(uh>s37}?5X$PHt2TD>wWTLHn4qI!KfKEnB6wT`ICCa(Xo9PdMh z9@V{)HcN&U@ULD_azJh>MytCAMPQU7mO|%rSEqYwMf5M6M!tGT5^P1nP^!w!pVQ1x zv$MZ@fsVrA{Kftuld%ruMqV(S-X7T15o-=kpb*i;y(mz9|6D=+ zmm;Y6s)ikg%4ic?6Zpd&OB{fCPrg{%CDChv%orDTRh)Ptd1|9PBpOHuHNih_leIn) zsNQ3)%jsa}%Na1e;Q5h)-9A;}faj*8!aK-vzj;XOGL=OnAkGPsw_&5>%|8F0WfDz&W3*1wHk>>wh z!~g)wf9m{S`x4r~2`^xyhQ|qD!G8%*Ay)Cj5d@k4(<-|lIbw|fypNj+?k5xXhOm*c{mFp^=$?p-Oaf5W+Kn_i#cUdmfRR7QW3f#i59X4rHR*T* zAUnpWcgly=sV9xDcRY&^b!rvyjV*bl^KG^&UF^D<+>c*gI>2I!G012MDr{aJC;;w$ zFsqV{FSTJcvVmSrgpTJcOy>-NxX#w^3k5$JGtnR2fuh`rD5;|&3?e=dDpPk21CvI6 z-v@7Uh91lL-u!5%O8yq!`EDi~82)rFwmr7vl5%vp!~~|a#Q#(P-oCGU0_Tz9Z()L6 zNZDj&sIi$L_vC>4WR10dnLrHuC$y_1v?}#5JD>lMpWlZ|k+B9mmCni-M zJumE;i!!W=pb4Y5pMa)DjS3xS;kV2$fjYg($(A$$x;)GtQP z@qlJcnm7vHVS@0}60f3#)XcoPN}e4%mDB+}VWw z*kUzZd@@SjroRg~!>K(d`scf`f?>PES!k#>go6=s&dR5F%$pj>Zh()Eb8q}AM) z!z4c!-x~AA!6@6r0rhhXkX`c%qRjv`$iHdeaz1G=OZQpfCUh8hZ4DtT!l!cm~5u?57{C7@m#)@a=avc5ttE6*HS3)mHURqHK~z;Dr>6 zhSnJYDJSEE2yN-hs74#LZ9(=rSstPhZ_;>n-pOJlcsbXJ)%Vs3Skb#z-ub|+R*dg! z4%DJ9zSDM``{e;iPMsjT?sn|u^xCwNxAzKY{i$#Jv59-&k-#F+3L>63c`|{_6;_a- z=8F9z*DkM3?7)f^FwJPc&YKON@|%MZKzr(J3;J0yCiMcpj~pP&o?=snSA&1vk&B3UJJ`S2T` z-)o)V_d_XJC2p3=r77feqzghHPrni3%qzr3C>ZfZ@#A7IRPz?YuM*GGIL_fRQqm0S z$#^Bc-SD^YUTESIC(Za3@^-ALNIM9X@2ICWZeT2qSou07%Q+Rna{O-2hY3}xGWECd zgn3)@21>60GsFAM5Ms2@F3vW%7`v-4NexZgS`^+;fA;yAp?F?o^%&fVUaf~to7h!> zGg1R|AnBGz$r}pl5!A@ak5rq(WqrKxyNRdHwul3%TE#npc*n!ehWUPN7}3u8yzdGY z#Ywo?LR|EUaMMcRCH;N{`RPw1y!jX}2!xL=vc4V=dyzu19vwlhx*wP7*hU+bPQoh5 z5Ph5@-yH&ODGGnT$%Q>kVDK&e%j^!#K*D+x1BsyqU+X)&!K{=WhezHhBRq|{yZ{-S zu}gYijIV%RBTlyH%ciAyCmp&THq(J#s5pK1n-|TAvLi@mp_}bzG@3`;yhp}x6I2JP z1QiXu-2-Z7l9U&}lG5t1GJR0GFHU`A>}qkwMBrVeME*JPP22=8HeZuP-=+gSGIFve z*xAwVqW*c3+Oz5J)4!c(8*Q2&KTM;8seD{;xu@)i6@k61ZJea7;!l0%@$S_QOBDbJ zktwjhbxBF|H#R7QvlgRYvZoSq5SMJ!j`_|5aGUYl&_9*}#&v~><|}Mm#z|m26g**#Mb3gEx};9#a5h?A>&QZ5 zv+&@GG?Pbfo?_`>^-G|v%S;CZJE%>M5Slzx{ZXr#c8qB(SyAhYONDary29+ZY5*^# zJ{(L>yY)oMb(0@?Gu&q&yImoWP@n;DXc^nk!bZE<2^;h@k4N?`C_CJZqY`QZ)=(Gf z_rjQO{55YWX6t($Gkky-JE*I7S)EmhRWj2f|x&4S+vhy8^!pkj#iKuZJXvyO{Ne0QE@yPt!u7d+nmJb z#IsC3P;6)st?a%HKKaHyT*m-Xo1vwYkE-kQCsqJWxO}xrDXcDY-{R1$DsPDb)%`(3 zwQoq?uwW-czWDVN(vxj~SqK5*({VwfjrusD1r_PWtflum^m>E(N3sy5L)tMXy7AWl z+>&4h|0EAvjH6(vU@WAtVq%l{06M-I-TT3I`0|x7F+DYO+BautzuVz1poIl+owSG16N7+;@7ne zc+NIB=F0_$ZpW}G%9s3L;f=eX>q3LZcl%b4_Q()|^Of$9$9`v6y2JH|5QEn+*3ThY6HO6d|RM^n7pF-0dNqb;`1vala zXtK({!uX9|IPRK$eN!f5##em$Mfc)1Ib~^)W{zv=Da>TrH4k8MB6J><_|%FlY0c`k z2>|-h?tj8?7NWeALlm6J4hdcdOWpiAH9*^_KwLJDLQbzEzjR?#EGIu|x^1d5ULJ>O zUKa|c*UlVB_}j}D#YTdm%BrYJ&PFpTP>BqcIOatSrMcE9UQujdCnICc9rk;_YbI9l zf-c_>_v->)QNZ8CM99>hLBePiuNI?#%Fz0ftQFWgZy#)+xKh=R+L55DDT6+xs6e*2 zGY^UNsuV7AM`)xHsZt4vDjms3U^byl!X@zPE7152gmfN#T;-bvO zgR{T=DWK8%3|HlG30OwzgqauRd%Dwcm&RePEXA;208%pSA&phd$9}>d14RLsbUQ*Rq8{^<64{ zl*HQu2+jQ2RGpQo2Z(7Cw_iF{$U+u1I;fh0G=*X&xg7kUL{k?>0~G$jBSoiBHHYKi zj28a_M@)3RqHy5Ua?W|6t)Q(B6}!?-ER@ick$vHIQYeA}P8ai7g(u<+~yKwY1-lEtZw2)j-K;b#= zQHFpJZYIe6K+G1BSmz$&E4-EtZ(j+o$J>6c^BU$y(q_aq-oD?R7W&3j(QB*SDiUDH zl#G(AhTMc^Xz-t~>TAZ0c~rPOZb!)727X=b|40@19#JmET^#-==jOXUi3dSeQj2<} z0KuAxoIb-Wh5a&di?4mlQ!ZZeWcop`AQbDfWzt?sjcEQFwEC}+GReJW^=zyJ-{86_ zJD?xcL>nAY)ScMPT^*j?w4H9C;`sV65g)Nu)0`hz>Sf_ zjU=|2h3>~Nsk&p-Inm2A1;lOL^1kQvJ^!>4Rq0qRWYHYH2kTFc%oC=Y=MH;te^<%- zQ1q0|SdDM9rQG-JhTM~PcrzwEllFhmDsvAXu3rRBWu)p@T~Qth;^B5@{mKGfDd_&B z%MsXamxwX89l^(>3+{P&@0lv+k{Ch(0yZtswuas3oZ^RCsi-<1`Ee8BDEtLMv zi4hVynf7UKPiC?Fx?r+A8{y8YJ9nK!B2CH3%Tp4RX5rgmj`Tj_=@2g#QnX&9#Sc_7 z*mFy-NXY!0iZFJJbB*sOnf5kJxa2~1l2NC?aSxp{*aj)`5RaY`c6erK#!7sYr_VA0N(!Koh~e@jea z($5ded+MiFN9#S1V{}JnG9KJi5;t#4%T4+yI7lL%LSMY?F<*%nP3%Nt2gsAqrqE;- zToR0119kydtk>}lf`f{zC{0*rEu0x=?qW;#N0w09p00@W+!av2`(P56QqBZv0uI^y zvV{WzIg|YTQ-jc5vTQ)7@uKG9$l?bi^h-=VRrkS4KA$~z`w}$0bZ<^h67$1hOSy-4 z7qzII5@b#vlRNpPsZlgR3T7JDo?$%*5-3TCw6^7Vx)j zHt|1>vd!wNG>PMwLYc}iiw4~iEIoM9b7*}XyD3F=!-g9mLF@&!gv|+)wMCG^nGjKi zWE-C9kW5=OAI5r%=Pg$kcUN~c?d}HCfw+3cqs&RtfW(wf0#uWd&FT|G?blGnNhY#OJ+WJ@Q$U1M^#trd6vl*Wp zk=x+ACP&rpL7P}%-e!XuX@er{n6W~tkGty_ob(fp=8(Cx^Nq4|YoyjL!Nmo{_9bS% zdxTsY&^Jz3?95?4H9uh_&Pv910Pk11<4RuJXHM&5IrzhM>V0?cD&5GQx51d~^c07wmP_#@l zHTwldbV#-I%IfBsf|C$-a#v|Z*2ei;PU?5Spc31;uyymZLD`z0!1o>a&sxIUZ9JLwJ??`JC^H{ z-Lwv-6P2a1>yKR)1R5$-)`(x=hgtyLG9&OP7STfsMQ78RipSF`%=3hn5t{;)&a$G< zgkJV>#G&^JZ6`m_H`PA&5Ok8;^+EQ-!V3NQ%M6`aPE#yb6rUH9V!9fVZ<=v#`sZ$i zA$Oj!sB|?H(0et@)OoaNnq(jIsriyd<%UF069< ztkA6zeg9fO^3}hV5pF>F*Ka}(;lh!}27xC3UK{s7Xni?W^cy%Kpq*5t08&^B&Bt@U ziy7c2Cy&LaMz|~+f;4&f%asyTWN$~yxEZ-aBZP#4=_R4u;5X1v0#IW-7Oa?R)+D>c z^+S5=L4U(c=YYh9;z_%D@Ym*V85;%F>XY(Y(Jn6xXaX&%rxA*&ibX-6p}!u z{;jlU)BGuOTw&jfx&ry3-=9g~tlcA^1ujk|t9a8^{?1|VOh1N&YacgDVw@S}+Pcs5 zzI8AlY5#%B%Uf=Ap(l+=mFKJeMEA^{4ii>s_TjR}#3MDFv+ov=y&dH$IwB=b{>~u* zEW{G#YdG_QK#$poc&(>yW7w#k@7Ih$;+XN%l>DdQ-Jx14A2)rsD=VKno|9g!H$?SR zdko17qQPXj{SBY{ZvuH2@?JP&d1WMQJqw*B^DUWv?BuA~HWN~AZ}%L&esI~b!HKw< z(2*M=>auZ(7NcHFwDqECDT#CA+Db zKZm?%R#{W*b}n?V?0x9}Z&!bX`#-B8B8u;C^HRqy7D5xr@j(C4DQN)3%jm!HlfU5n zU*cce5K#*It6GtNNrk1thvt8s|A75JNN&RaNL$!M4S~?o)5OB2#`*6;`6o3btIr?u qmks`l2mf&jU*z9{Ab-g4Z~FhNmH%TlU?KmT==6{le?jb@z<&XJ#oN39 From 6062c28068f2b160a095c7d54a07abdc803a7f2c Mon Sep 17 00:00:00 2001 From: yike5460 Date: Thu, 2 Nov 2023 02:50:51 +0000 Subject: [PATCH 10/21] chore: 1.update whl to dist directly; 2. add table split in splitter --- src/etl-stack.ts | 2 +- src/scripts/dep/llm_bot_dep/loader_utils.py | 118 +++++++++--------- src/scripts/dep/llm_bot_dep/splitter_utils.py | 28 ++++- .../whl/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 16003 -> 0 bytes 4 files changed, 85 insertions(+), 63 deletions(-) delete mode 100644 src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 1a136b43..f4dcce02 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -44,7 +44,7 @@ export class EtlStack extends NestedStack { }); const extraPythonFiles = new s3deploy.BucketDeployment(this, 'extraPythonFiles', { - sources: [s3deploy.Source.asset('src/scripts/whl')], + sources: [s3deploy.Source.asset('src/scripts/dep/dist')], destinationBucket: _S3Bucket, // destinationKeyPrefix: 'llm_bot_dep-0.1.0-py3-none-any.whl', }); diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index b56097ee..362e4e9f 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -177,7 +177,6 @@ def lazy_load(self) -> Iterator[Document]: except Exception as e: logging.error(f"An error occurred while processing the PDF: {str(e)}") - class CustomCSVLoader(CSVLoader): """Load a `CSV` file into a list of Documents. @@ -324,105 +323,108 @@ def load(self) -> List[Document]: return docs - # local debugging purpose -if __name__ == "__main__": - markdown_document = """ -# Learning to Retrieve In-Context Examples for Large Language Models +# if __name__ == "__main__": +# markdown_document = r""" +# # Learning to Retrieve In-Context Examples for Large Language Models + +# ###### Abstract -###### Abstract +# aaaa -aaaa +# ## 1 Introduction -## 1 Introduction +# 1111 -1111 +# ## 2 Related Work -## 2 Related Work +# 2222 -2222 +# ## 3 Preliminaries -## 3 Preliminaries +# 3333 -3333 +# ## 4 Methodology -## 4 Methodology +# 4444 -4444 +# ### Training Data Generation -### Training Data Generation +# 5555 -5555 +# ### Reward Modeling -### Reward Modeling +# 6666 -6666 +# ### Training LLM Retrievers with Knowledge Distillation -### Training LLM Retrievers with Knowledge Distillation +# 7777 -7777 +# ### Evaluation of LLM Retrievers -### Evaluation of LLM Retrievers +# 8888 -8888 +# ## 5 Experiments -## 5 Experiments +# ### Evaluation Setup -### Evaluation Setup +# 9999 -9999 +# ### Main Results -### Main Results +# 0000 -0000 +# \begin{table} +# This is table content +# \end{table} -### Training Pipeline of LLM-R +# ### Training Pipeline of LLM-R -1010 +# 1010 -### Generalization Ability of LLM-R +# ### Generalization Ability of LLM-R -1212 +# 1212 -### When does LLM-R Work and When Does it Not? +# ### When does LLM-R Work and When Does it Not? -1313 +# 1313 -### Using Different LLMs for Data Generation and Task Evaluation +# ### Using Different LLMs for Data Generation and Task Evaluation -1414 +# 1414 -### Scaling the Number of In-Context Examples and Retriever Size +# ### Scaling the Number of In-Context Examples and Retriever Size -1515 +# 1515 -## 7 Conclusion +# ## 7 Conclusion -1616 +# 1616 -## Limitations +# ## Limitations -1717 +# 1717 -## References +# ## References -1818 -""" - markdown_splitter = MarkdownHeaderTextSplitter() +# 1818 +# """ +# markdown_splitter = MarkdownHeaderTextSplitter() - # construct a fake document data - data = [Document(page_content=markdown_document, metadata=metadata_template)] - md_header_splits = markdown_splitter.split_text(data[0]) - for i, doc in enumerate(md_header_splits): - logger.info("content of chunk %s: %s", i, doc) +# # construct a fake document data +# data = [Document(page_content=markdown_document, metadata=metadata_template)] +# md_header_splits = markdown_splitter.split_text(data[0]) +# for i, doc in enumerate(md_header_splits): +# logger.info("content of chunk %s: %s", i, doc) # local pdf file in current folder - loader = NougatPDFLoader('1.pdf') - data = loader.load() - logger.info("raw data: %s", data) - md_header_splits = markdown_splitter.split_text(data[0]) - for i, doc in enumerate(md_header_splits): - logger.info("content of chunk %s: %s", i, doc) + # loader = NougatPDFLoader('1.pdf') + # data = loader.load() + # logger.info("raw data: %s", data) + # md_header_splits = markdown_splitter.split_text(data[0]) + # for i, doc in enumerate(md_header_splits): + # logger.info("content of chunk %s: %s", i, doc) # official splits will be deprecated by the new MarkdownHeaderTextSplitter # markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on) @@ -442,7 +444,7 @@ def load(self) -> List[Document]: # chunk_size=chunk_size, chunk_overlap=chunk_overlap # ) - # Split + # # Split # splits = text_splitter.split_documents(md_header_splits) # logger.info("splits: %s", splits) diff --git a/src/scripts/dep/llm_bot_dep/splitter_utils.py b/src/scripts/dep/llm_bot_dep/splitter_utils.py index 92a5e0db..eab88313 100644 --- a/src/scripts/dep/llm_bot_dep/splitter_utils.py +++ b/src/scripts/dep/llm_bot_dep/splitter_utils.py @@ -129,20 +129,41 @@ def split_text(self, text: Document) -> List[Document]: lines = text.page_content.strip().split('\n') chunks = [] current_chunk_content = [] + table_content = [] + inside_table = False chunk_id = 1 # Initializing chunk_id for line in lines: - if line.startswith(('## ', ' ### ')): # Assuming these denote headings + # Replace escaped characters for table markers + line = line.replace(r"\begin{table}", "\\begin{table}").replace(r"\end{table}", "\\end{table}") + if line.strip() == "\\begin{table}": + inside_table = True + continue # Skip this line + elif line.strip() == "\\end{table}": + inside_table = False + # Save table content as a separate document + if table_content: + metadata = text.metadata.copy() + metadata['content_type'] = 'table' + metadata['chunk_id'] = f"${chunk_id}" + chunks.append(Document(page_content='\n'.join(table_content), metadata=metadata)) + table_content = [] # Reset for the next table + continue # Skip this line + + if inside_table: + table_content.append(line) + elif line.startswith(('## ', ' ### ')): # Assuming these denote headings # Save the current chunk if it exists if current_chunk_content: metadata = text.metadata.copy() metadata['heading_hierarchy'] = extract_headings('\n'.join(current_chunk_content)) metadata['chunk_id'] = f"${chunk_id}" - chunk_id += 1 + chunk_id += 1 # Increment chunk_id for the next chunk chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) current_chunk_content = [] # Reset for the next chunk - current_chunk_content.append(line) + if not inside_table: + current_chunk_content.append(line) # Save the last chunk if it exists if current_chunk_content: @@ -152,4 +173,3 @@ def split_text(self, text: Document) -> List[Document]: chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) return chunks - diff --git a/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/whl/llm_bot_dep-0.1.0-py3-none-any.whl deleted file mode 100644 index 6072a808c9e103090f6571cd9235b92b5f6372ca..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16003 zcmai*bC6}rwy(>!ZQHhO+qP}nw(aV&yKJk=c6HhH`n(r$_P+a``|ga$HP`whzsSs( zYm7PYQ;-G*K>+{&fB5KfnHzV($3OFU!Tsw z6Bt0@-_t}r_0-EjoqkUP0|0!$0svtAW16A8v%af~rH%8SdBbYb_M03Cy%*{T=Y(to z?cv8J{o!n~jetS44U}`j2na1KO=}WGX(dVRVc*{2@I6Kw7c@{Q;E_4c?94ck>aht~ zHKAzuMAvZqDHX*!QBkM1Ll>~WAGGUS4l$6vYy~Yv*PA;{YU#K#<9BZ8`1jW%Bf+dd zWy(aGuSj}{u4WdH?&8_+(^sZbE3740#dmZGO4n@W{7tqA#;b^F*b3}vZP}iuLWPic z1UWBE?7qJ4T=7IJLtw0?H7n-=hCAAUOXp^7>|o{-L9&t?1l0tV7%ir$A`n_Zd(MB) zh^e8`$STJ24YCm$cLMQPin|{pIRTy%5l(azYVip5U|ncL4Ja~|ij8+saYoS)2T=#- zC`?$_QRY%o+=t8&KC;kimkKV(sKl&F7q#SYi#G3bTtBm00QKDEGu(+NT|~7^JPbzo z>6*vxw!&pG8PaqgTa~b0kWNv7sNugpcR@GzXyifQP|u~%glg&-bX6R^KRX7|on#0v z(pGqTz8Zcf$lw-GJhtAcW<#|=RVa}*^^K#Ib(1IX>@1*^z}_hkTzRK;zSYFFJg7`B z`yJ`=ns2WnUCgo=2r_&8er$PHF70aK3R#vz$uBOU52Cv1CN6?@&?-EqO&Ds-N0;+3 z166Z4gZ}58lPCfB}A}{Lr<;-qt9Cg6Dlke2Xjf#kNGg z4>psjspqv09@17dp8V_0RP}F@HGeAQulj?`iy*ZE%@G!8m)g*IcZbN}y>1(>d%o0b z^9dP5_b(s!Y>$1d2)aEj4|uXiU5P%*qmk)(xj5>Nd!cLJ1nU*Mc4)>gFv}_a>)O6g zK2GUc;E-^QgFe;n9q>oI!Q<^c7NmGGW51aC z56+X^Oc?vN9RQXXJYnuQaZ8BbM6pP7ek0_8VwN2{f+D#95YYrUv^)>PbAP~NO`)4Z zcNEx$Al_48A8r#JnrmeCx-?X-9TZnuqD*n~p!w8}(X7i}Bz)XSt3w{mt|?OFvj%FV zLk7bmMnU;t69JeGeu4?X%?M2s#c`cz?!>@@h0OU*V?^0rq!`mF5Wy1*7X1t4GSS3t zBna^;g~Ld2xt8{D+k6hWl@1z_(!k}f=G#U&os>~o9Dq< zdA{8xS{1*P=VRfFqAS4^z0ULn zKEB%@b3*N9DXXSJVB~%cph_5gWQ-?*>?T%?6u|-lRQUtS5Kdl>7$ciRQ-T^<;V_^B zb2XptEAp{ZR4^<&V^_$e?UNr#Z7p&G;ycN`~satH0gD=?^%LX4eIj66g9 z!(uf-{&5R#g|#Mnv>#Sv3K60%fsJUq@aC5E&WYb~`>V z=x_A8vPI8!@qrvzelvm&i01m#p!)r?$5cw)Aee>AZxrXX7jhaZXQ4DxAAp7 zEnTiA=VPnc`Fp%>uS?kyncKE|9bN{Si~W6gJDsj>U+cyFz3+GF^<(rrZ6A*M#Ogon z9?r?x`@DYNmY!^we7`Mg&-(Gb&5kUUO7FWfFnR;_CD|+&-C7dOnP*qvS7~ zfMI>MMla~Ql_UfnhxlKxH>c1f0=}1dw5LuU_%LkjaBj|EF}?d4T{8SU$dZEH|rKM)+|(JHA{^W@kbVZ zw0mN?Wr=>OGLy$WmXT*Qxr5H)rmfs1!mmcTPa1Sa1AT* zc=&dtg^~)wZk$2?u(&2Ak|eUjN2l2Fbo3Ys?71n&93?Zv&bsX_6SO4dgUWK< z`EpaNI+lw{4&yRybj>6t2#Vub)KS+9juLu~LPeOk>1L8y7ahU$Bqexx(h*wj^ahrM z@z~LA2*%%c~Mn8&GkyqtK z`k#jRmPk8vauEse4Y$H($Q|32Zjqw=~o@V0m;gRib)B-|D-7Tgm zrBw{WL+$!|Y2`5!rnvgB^5c=&12c_T53kQE>1WqMB- zqFPvUM~<1YND3)3>|21r+ND+&RCEF{$pJ``NBrogs8cA1BAWNd{m}1H_)+Y)DODZ! zVVS+*5_5vppl)%HHIF#5bn+t%93ayc`h@o9^-`NQ82?t7a9HmoiA5bmO3e`u#I8Zc zlpP-`t9ESHjI{iR5uTiO^jp3VnDI#CTnT9h5j8LT^_&_%L8F z!=Q@J)~C*}u5v# zAqQrbGyG&mI@yGsf$OK|Bq)QMb{Q9iY!_^VA^_K9FIRLuh?N8?HY`G2uN8c~a+zD| zlFGyugF33p5r5n-hzp5MNOvuDTvDiSjf++aWNPW?Zn|P|oTU4I{D6j+D!o8@)?RuB zc1#eCxXJ5U;Av`N^UZe-syVV`PlIgzDdWBc56ZOl zp>U-@`09NYX-C{`5wk%eR{}nP1{qRG7a993p+dY~yg}p`8tJHX5PY+Sn)#{lsI}G& zO-onmd%5eHtW!KK66|t!IG$B(%p17}eo8!s*RnBJi~^@C%ziKiV!@xypk$`}xhR%F~lh7>Z)Yo%_< zW|k?2QB)&KXn>MV*sZ`i_5;Bf=pvz;A_NK(fgk;(s9ou|h0srLN?NFl>em$jm-9hK zz?>fi%<;r@RGGYr^db?iHGWBkWLYx|$eLG@Os)5FzZoQr4DM%aVnJYAxkt@!{0o@$ zwK8&%$J``8B0?r0tL;>K+nhpZZKcDwGW9iAj?d=_e&(&5W@}Wln$aox#ccnUfI0IB z<_Mr_;OlC%tK8DQ)$s5x4k8;(`MF@AZ45E3Qp2^()D@mTPx1AR8H-1FV9xUPSv;@G zvksc$V_^=vxLWB`!C;BIkAevo9VSo;wyT07E+E^e0Ld~b4?whbgwP-f0AAJoRMlmY zT7QJvw4i>_xw3WUrAu##UPU!kMoyxDJU$a*>^1rmjz#$uFhJ%NDFZTs+zwvx9cwvG zM03N7toMy>rNVg-#WQ3ao{B1;Cm?}fFdDo2-DM<@RIqK=qt8Gok%c}b1deB& zuxFD~^;Qy@^H2x@WvN?N<5C>ZCNxoBv)LcA7cH^1@uW1E<4jImQXgIn=)bg`Y+_93 z^hc}oDGy|=LU;1V7Nr>xciRNX%nea{w31nJn+c}aQZgh~fd(U2MUCALmiK8htkZ)- zq2xAUCgoJ~e}~4R=rlK+F<)J69C?yLqZcDvwoAq36NEN~BaU}=yeLJIcxL^~VFmgq z6Y#4N;lefQxJFE7xk5##QeY%MQ@g_8@^67wn zFwV2`DdxzSQnl&Q_JuF)J?FR+Ravg(YFIi0cbrjFU0zjP;wWl!CCPIsR#Q`_hJTL# zYI3+6#@pD-?)#O^LzVhvROzLubmdbe-G=hxZCqPLH}Hc1?Gyj6u!>z%Q{8nR2|FJY z06+{L0094=VU>rAlcBMTrM=z1Bdb_-Tl-Bmgs(b%K?-a#*dH6dth<1k*+zg#Y-4Yx z62PHpa%}37C=yk&YKMI{*A?#EV;=)-!`{Bv*My~2V%X$TJv-H2_!NCDS9OeBz}CTo zr&25nxKgdiTTifbpJCos3bp=UdZu=ybt%I)^0CUDX{NiN(waBT!78pv7d5p7ZkCm% za3f^0dJkY<321iP{_Xp@$Wbenz`>ydwry}hbg?GnwG-QCoeJ+2A@&M(@41ul_=Wg^{2&yd6jN2eU{Dv5L&^{DCa)Z`rSl?%F z*cjpj^L~X=4TF}BzMp)M8qfEF>vDlYtWNSs;0W-n-hK!X-HFGMOIQpRt^2htJkNrk{5B9LRztZ&Ul`cw#-CE>5S&o2>`xPBa5;zZGpO_Xcjh&Io+A16?eKCU@Kk5=8raod^Jv) zS;X|ge{qrDZApT5Sg*zQRDhcgcELJ5*2x$Fb-(GQ`#7_j>XH4C)~$O}k-3tgy}Zj8 zbzUXd4-T(cxqx?F+Ya5+=O79h%Nnmvl*26?>F_XG9n~vd{~@!;{`RyV{C>;l;rWBd z0At#tm#nNd-rN+4dpP@I#g?3lJ)Ir>11E;l?>b?jEngNo8j>T9WE?KRR~9y%cac*> znsVbb)E8$U37PC`hf#j}1$8kXL9HJrQXLw8&sq)3O^wxkl)_o z3ei(9?u&+$tco&Fyd0Oz@NyGN45?r!lrn%k@0!#7bL+s5?L)*UEq)a)D-JyF#D}$q zkfKaDl03mx2z_W+>~xDifs-3kwU;-$qNVjSv7Mx7*}#E?Delo} zUp)tj&LKoA23%M-_yh#{y>0j6vqbwKJG(AU z1rf}^rnonTBV2-wo=lj$e`!l)rh0}XbHwO3nEB0ULFf?3va^!vt4|KbV`QQTyMlP3 zATYOcB@);R4;(6+>6gp6xezFF(;e^j9JAGwVLx;JwzZeqnPBYnc76&uGpw zl@n4DGj1mWzwz+^rq;wQRQ^z%X>^gLY%uExuIRJDHxStmZNxImP?m(UJur6PE)UaA zvRhj)?!X@7d1&1Q>jO=6+P+BJ`0X7(&QALeBzG>{OFdS@AMxPMe4eW4Sm0psYq2lu zMx2;q-+#-RCpj3Z7v1!{2q6Igxbpx2VE@Cm+t?eLm^%I2t}kbxXkcqcu)ym}9_^jl$+uX?5a?*t+5kOn(wSZWPJAT}hcR3PzBwm@E@X(k~5C)1B z&$GL*6C~C&bsKO#hj>LdS5qhIWy)*Hse7^=D0xNmNUu>*r8QPd77v+hA&m6}3u^;T@eD*rlfK;`M8mSld!7>AsV^K2!JI(N;hvR>Z z6)&GdiqLw(7%af#-3QG^jSKzs0s1v|+%I@$+X$8gbeu7!N)uJ2HSL(9KijiZ4e@X=dN#LRc4+v2*KT;|1sM{ah&SATYXCA>}a~O>6~C zmdqmcVzO}G9fCg3LscX>kgSS>iUan`69&&etrXM=R3+@4__)4^rxOT{?z0z21IRVN zqY0&e{?(=ftfI?%3(_Q&Gyi9S_MDGnWw!YwQ%_z5C(*|q_zn=TwW4|BI*Z=@qix!3 zDYs6VB+`*JtlN!eGe=U==ZNXRNyYR4&eww&2=56rTPBH+REZOk1Rnhx%-p%3Ps0^Z z7M3yBBn=a^P4L#E zW^0@{hU%Q1h?*3%HJ~P>T=7XGs~AucxY`5rED2A-wE$T9+|o>O%+U;3ZC)uqWAK4( zQ`m^n$1|`UhOO;q8FEi-LMZJL(q}4$f+&e9;OtD}Jg_BPy(7&IsIVSA*qOP#!OKUS z2U#i_&>yXpRjoJCh>A|@u)d5KdU^OrWv(QUIm!q{x``eB_J_uEovyQ|{zz(Sm(!?MYkL4+5Um*Lr z$EOzt7$9r`tc8A|TKi9;n_&Ck3C|~^;uD#8LNTdyo^()^1boXD-y&$>n`pOSM(ITx zaKsFLS#a$9zeC-VijNtxDa%Xc0g9oD4AN=R)_>V$DaXw#PBAJ4qvc?-O;^M~v#cq@ zH6`wc^)YTL(pn8h>Y?vum6;CYP7i?LnJIIM+Xw705HFZ$)KL1V-l3 z&v^#HZa8cNQGk?!Y}lvskgAw125Pp@$P*Ll0>M)f_++1G@-gAnlB!*8iBLgD0P4KS zNi!se*kkZd5@Z4K(GkV)1`2#(u0_bzwEkQON+Z3E)Krwk(=oV!#!B))Uq0zWm-tS!4k%{TW^UBXbu zP9+ZCx-K}{{z%RU&chBn$CSt|Y&D!Ayx)ZQ%1pIED@6mYV6MGdl&Zn9;17NL-7`@- zj#PvCq%>(iX*s#q_|>K^DPCwh36|WChy4*GR0X+Po78!!2B*>LCb&ee?n8zK)Uw73 ziyd8P9~sr4CCnQ&G}_2Xbf#FmxLB;G9<>E?AEDpCAhZGlJHRaMfMQ`s1zktK|abO)bbPlPb0iB)^ZL``vq3o|DHh9EEn!nrWe|MQ1ULc^8=#x-17g$%HUc2kfa zz%&vBW%ZUbT6Cl#`%jJDtl67`Agavgw*-o1GGK_ltoUA}5AA6wTgN>1c-+QvaPr$q+_U#?++|Y-_rD-XQb1cMAu08;D{#@2 z9Pn5QEiRV(qu}b#kaJ#l5{h{g^$)=`NHmGZ7&4h}+>oL5JA$(CMW5r+>L7)b)^Xvf z2XDk*YB`*CSUxwB#uPj7cuw;e9=xS$J&!;f+zHVI(Be_^pq0rwYB2e^7$v(Z&0?Q` zZMko{3L^;IF&L+A7`0i(G}%7U^HJ{@m2DM3GBFLEk{Tln4`JmB0<{{|MX#cEJ$Zqb zpsmerBl0GE~m)NBz{@)#^~K1KQv!6WC#Uiu}B(}F8`zA~c93vDU@ZyH%dhUnL0 zFG1nJwp!SJ-ySd3)LdU9)^1->(Vwy2<~x-ZUervbavG`RUOdbs=IoV8y|B}PJ=3^z zq07P63ML&~HJ)FPT90h4};*sSPf{b&?P=yDwAawLBV-@w+h`Urbz!Gd-)u7<+Ca(YeYTwBwtx z|MM8UZ#Bp@&`Tb$lb~REQ0WP{H=Xyypn1}_!3@|jlF3YshJ&6_N~YSYs#+BktsK;lIxYNk-Q10Q>>+B3-uX@DUh7nlkM;1{iO zvbqleJmgd{lWYh-N1}L4lOUsHtcrR3zJY%tpb2oJ!Dgoz! zm$e5)O_R6XMl7ajrkb+eXrr?LEn3V%-1R1>o{77XnC{#t?a5kkJy)@P0+cpJ_vJXp zkm$iBc_Lh5N*363Fhf_A~XkVBx<5F_~xa?~;KRc@;)clb$vT?MzM$-KLn ze$qWX=zIq?vf#d4Fk$WX*DHP!4Fq?v>Zx0Zz)48TSp}USv-Utf!1z8L#Q&rcc11-& zthYF|J33n`%t8}!EmNCb;(*%-asbQ$2|)%b0&%=y52jm$r2qu@ViVNL#FLCGS zlB_CM8G)A56jT(AeXJIi?_=MOO4(y%ZYSV~CZ}W$H^Blp{pWR9C$IuxK+ISet~f?q zrvSp$OHJ}lG+lrTcs78}CBR?PU^%c1-^NcDRUuI|nat%uVF z@zKEf_DGS!+r&ne^REdOSxbBbdDp_{H(F!J<~14?YI0J5-%G3JTDUE=IwJ>Uv7I>W z`k!K!I?;)$w7WCv*CY6r;=jyK`}UN7xo2UwlXez`WUWS(aCt7CJ!Y5a07OZ*HZNMY zmHSQIWSv-ktHTZ(U9&1);#=d7q+u}_QymyoiMJb zhw~GKRK1i8XS^T-<6NsKyf<&E%GP5y1fES8#03NGosFIn-HoTPM};vwS@c=mHNK|b zm8bWu!YCj2IANYNh{;zF@SU$m#R({P8F3ZOOx;S}GZ0=@R_@8$)E&Uq_KruCB!=uW z5+11YgnhA39fjD@vtLj-ikP&?UNfgIQhJTnitiLy~Hc6{d zM?u`sz21QEQXhz%&8Sy zXYlzJxKZP-2QmC86|7}M07I_$-G(O3t|#MC5jU_cTk8GJ<<<$^qp(O?chQJTAg6OO7ypY z@(755p%C2ZQ{`7&p<~*uT%*a9$R`y0%q#MLDVD;+=dJx%D))e@HjQ;s_fQtkH_wR9 z6%gWCWe0xDopih?JDx$Hf4zLrL8IDL*uuZnN;#UR6|{1yg81DRtiNtVXa=NF7TFCN zAMD<6?D4NM7=vzdYd_id+>HuKr~Q(JZ>Fu1iB&&|^eC<^Z){I7w5!hZKJtsU2USg( zk+8n{rEU3Um-?d~+qk=XE$tKCBwrc6a?rX_pBSfK5BfPe;qCW#RLOdhBUT{Y$tsQU z!`{PC9NU3RFu(}CY`_6c+-fi31-5EsD-5+=H!Zh9dJ8Jm)>leC8?V#R$_p76dYVU< z^VE~x^-31AL#boS;3Dm!V!qYK7)>DYj8&QA<3{}~OP%l8KpJ+4YM=JZ3dDjAt51q8 zaEsq)Bj0i<>`Z?qK2q$OBC3+2UlorGzEC{~#PF8=!~ozfE?#Fk6fl0o4;1E>m~?=$#quxh;@cWp!PMzTJChX zT~>W=fLqmz(%7+IFj|)%pgVq^ao*>JJIDA_W1f*c0A?0kFG$T76!LD8EIez?HxMR#tHN)92amQwf<20s!y@ z{ongMoo)Y}c8XQg{!>ms@msCuP*Bu3M=ABjV8Cz)EIgfqu>_~_W`+q&i!)(Es*F_W zUb^+)A*_TxD$xw-M;v#4opGmA!3Y~LuhG6NxD>7HK!K847p!lu*o|XFWxLFaIVW;3;5Ad^zjo&*>qrQ3~ zKcHmurh!6IDqh0HzKwCjXI5NV0}Tw6GPD<)vJ8D& zxfXT2HQU%?9(G=!^+ZLqhh5BG-i)aHrGQX0;NGSCe?_yCR!y zJ>nR)$MSJi{(434+Fc(asT+k;4i+ISnQ)(LYSwg?IlwFXZ@pyugwjo>5-t8Sp(a78xET5RktOuEs9B499r9L#mQ}qf^`^RlNybWCX*%A1Yj+*hzg- zfsm_Z3yUb@$<=rus)lpYb1tv&Z7J7Nynla(cHaC11#J`l#as)Spv|qDa0HGmbWU#U zb))de8rl|kWw$Y9mL#KI6hQ;d_@^KN&-Ws(LHz)CBE=+X#KpqOzchCc2sj#PbXoz+ z;Fa&Of`l*B4U>k@#=-^5q+DvrCYqZm=7{#gXL^r(XM(P`#b{qnclA#aQ**3fngz8) zV?Uf4$}$P73<$k6W}+dX{zg3F}!?)VOSovStL zp|LLTXE^N}Nw*eJi8kYa`N5VxeR$WFK~N1{%9=yxHUs&AuUnoDIwMUdf4|=y6>boU z^TG)ABv$&oxk=9q&9G5Y9|9ap{l?6+g5SdkD;`pOd11sLsb1Q34!evBby;y`*@441 zknH5Qyb3o}yBjWbPVb!PA!>tx(P}1_wEnOdqIcO&U%0!T_X7V{dDPm@P(8v#EVK^{ z0H6vL008$NDRgHC8%r0Lza`OoG`9X#9QA+JcWeOHu);ANF9t-X*nopxaTQ(U?Fbx_ zXqnb50!geoK>gftcSm_NS4$f_J_7k#YCp0o7pmWHRp64kTidNH5lxk}<(*`IeE$Na zzPTJkscafsLk0)#KPir>UZ|kId(=uH8o8I3@KIfq%Q(hms`@F_+4kkhfkEN~87*T& z5{2;FPO(C$OloqNiQc->!fm&$+`tBN@J#xyAH@J>T1U#1#I522&LVRUQ5^xyInGXbZ0ZFSGGq!R%3Grz;E6%Cb=F@6 zPt#=94#=?BafJ0tNZqjv*#mP1BqednJ54x0q8u$Ughc?ON}j@cmaA1V9}5Vt7y>f+ z;RcOQ5+#t_eCCiZEKIEqAMom0$vPym0A$A+EmOzVeJT>iLBiB()Y?1UKBO$GmE195 z!5pb#0-taAK39gt=X@b^;fD5ZCxU?>G8Hhj7IXz{*&Djau>!woXMn5p3Zk)Fq5vBS>BBe?!aMoH~ zCSGBTuCP?AfUaPQ#U$wqW0m2DVZmIi*b}GOm68$Q?HE@jCcT(OL&W_I>PP0Fm}{b?QIS7+aoU2W zEQWiGxvvS#n~My0?yh?nc`)J*(pRyZlY-PTh?Q}dV-hABCP=1_ipto)0r{oIr6(Ns zW_s9&grG`h1zX$|W_umVM4+m*H1h^$k-K_`i))TF+;wLK^laXAvpsi0s@`=%Rz{u5 z#|%yfg+Ip?6H=kHltVcJW>=#3b1ef++p~*d9oEE?NEt}U)C6gO9>Wn(+OWd4ppB-P z%87j>vY5zap2H?nG)_o#bpi zfsRG4cD}Ya9CbhP#`W{j?su-CtQ1ICIf}s1073MhzGg8L`+I$ppWcxKG8P|DNFSBT z{eoo5VFB@s4rTU-bU9DI*hg!vmpX@zDwJeS$VgQ3kE@5ZqChgU38WD;Li9F|ceKSp z@*q~A$OoA8sxuaYu}5!;ev!PQeV)~Eul3EeDnb?sl|ax=pEWP4{UUq|2`!xCo>O&` zR<)G8m%Ie$8uXzLo0wiSX86Ati+?W|=1IUOgc!i}XV3ADc?u|ueC`L@4duI)z zPBdf>?rzfP)ArN%_ObM(U(gC#l+ zGX`D)s7_@u7Y!sG1j%)M}LSHKylC;ods7P~BHS z%nUV%pFEx}=!D=ty=$>xv2?vFT=mSB*Ni_pp1+>GKIW*lhZ7))61pP&a<2~a&|UG3 zg8?J{wUIWVsVz0yOo5 zF~$NhOvg+QQ`yw@xxax730ihj3{J^B#P=vV+Rrw5F&1Rsgyd!}HN5TiwgMbZITfQ)F1b#a+-g+6`uVF3LGL92Nj>2qSzhhxQae8p{=Su{`^U-a2z2Cv+ z?dddGH~HwfnehB!Rk-oo@YW?`EqQDhb6@*t(mSH0;8%o(LvhvQeHYi%C;U?}Pd8xQ zylXED_O5}h2LkI(cv^YEA$ak@S;dQE<2L?6Zppyz-`?O=h&9!DvC3j5{&+k}KmY)& ze|SDuJ7*UsS7R4fC(}QoE8Dn6D_};LFn9h?3CPC6jHyr=+CTvd=u%}0u~yU3-Q?IS zYTu7-T{>-YNZ-Da*;O6^A-Jcg#MVhSoQN(-Ox$HKM~V)&m&26{i+7Fjo<5veXjEz$ z&X6&|h}QX?wIbh{OXGw{KMLII!v`-PW$8F7oa<~Z1%7&FOPB+J5eh;)G8;xgM?4=H zki>zwRynU-=PXdWw8cXI&%-){!nXm94OVwv+zraThJ!n=xYG^^QZ-WCb@Pt1hs}J0 zO}2Q3a{70A(yzaj&(Hpp&tcsVV?zGy{O->n`L8EH%Rt9S$3SOd>Fh#lX=i3nFDt4d zC?cpLs5{GzG{^uWI=@{#26iYJK*dUng7Pqk))@X0Oaec2?I^l;H{CJal~5!u+Kwzu zw zmkXeX)1*W}NhNy(B`MRUL{Y)AGCesfy(~LbeYg+wj{}BFA998J zvm}^5gYth_nv1=IzKyAysSTZrhs)&nc&z*ceWbi3?e0IJEV&|n0)Hfi3xDK-ME?s^ zNmN)~NyK+vGO0)aAau_+AC$(-a30#Ei4xEez%+RvL57S(LJ6Y$MpfNnO%eTNKmHlP zN+1Yw%-|VNU|9_$K$Trt7}L4vL0jV@Y-jn-%{j)>;cYRb#lqU*VQ+6GqHb~B(B$`# zBg-txJv6T&BVNA*6$_7LW~)UoK(3((^Up0Y+6Ul1*`<)-?5&|9M1Q2CSEf*pA`eAB zP($qzTd18mxyp=8aX<~6>aF#d5UbP8ISX|-Z@E7 zZT1d0K4B6t;@xvS-S$Zp;d=;;EcKr8(Hndkf>pL&bh;V}-GjX=L^$TRUuJLVqh z!cgWfq!sIM+0OiFdSW|SRI^tE5Iw}U;1w9nST?t$pOO0^*5NRV{c8C~VuvH&X)Ma(iuw)JC+%b+P!gtPA zm?POLOEV2qUYIAEsa-~Cnl1~RR!#p7H_lPf<*nS2?-<2t##-|t)V_R<_R&=r;(fNV zar5>J5aM}Vg}}Z4kZ}ZhF2=QkHGb|H0^ts~wGZ+`DOlINA@t^Yd`Ho?>!p>3S1}?k z(OV|TQ1Wwwyaou;N2nbO{<~~Jvyr5T*1_VL@{+~6je;&gBd$k1q>-Ik=UjGjHPm0(7hX4R*31t5Vkf8k&`{{OiDV#xoA{ipH% iHx|(7f5HAgQ?4Kl`X^Zbmq&pCi2maM_PPG`>;C}8GEi^; From a00a7d91ebe28eb4fb627548c084be154aa9ce1b Mon Sep 17 00:00:00 2001 From: Ning Date: Thu, 2 Nov 2023 18:49:41 +0800 Subject: [PATCH 11/21] feat: invoke csv load in glue script --- src/scripts/glue-job-script.py | 71 +++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index c0a3a294..4e0a6bcf 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -5,10 +5,12 @@ import logging import json import itertools +import uuid +from datetime import datetime from typing import Generator, Any, Dict, Iterable, List, Optional, Tuple from bs4 import BeautifulSoup -from langchain.document_loaders import PDFMinerPDFasHTMLLoader +from langchain.document_loaders import PDFMinerPDFasHTMLLoader, CSVLoader from langchain.docstore.document import Document from langchain.vectorstores import OpenSearchVectorSearch from opensearchpy import RequestsHttpConnection @@ -150,8 +152,8 @@ def parse_pdf_to_json(file_content): return res def pre_process_text(text: str): - # Remove special characters, punctuation, line breaks and multiple spaces with a single space, - str_doc = re.sub(r'[^\w\s]', '', str_doc) + # Remove special characters, punctuation, line breaks and multiple spaces with a single space, + str_doc = re.sub(r'[^\w\s]', '', text) str_doc = re.sub(r'\s+', ' ', str_doc) str_doc = re.sub(r'\n', ' ', str_doc) return str_doc.strip() @@ -205,10 +207,29 @@ def post_process_pdf(pdf: str): logger.info("Post-processing PDF with result %s", documents) return documents + def process_text(text: str): logger.info("Processing text file...") text = pre_process_text(text) + +def process_csv(csv_content: str, **kwargs): + now = datetime.now() + timestamp_str = now.strftime("%Y%m%d%H%M%S") + random_uuid = str(uuid.uuid4())[:8] + bucket_name = kwargs['bucket'] + key = kwargs['key'] + row_count = kwargs['csv_row_count'] + local_path = f'/tmp/csv-{timestamp_str}-{random_uuid}.csv' + + s3.download_file(bucket_name, key, local_path) + logger.info(f"CSV file downloaded to {local_path}") + loader = loader_utils.CustomCSVLoader(file_path=local_path, row_count=row_count) + data = loader.load() + + return data + + def process_html(htmlstr: str): logger.info("Processing HTML file...") # filter out DOCTYPE @@ -309,6 +330,10 @@ def cb_process_object(file_type: str, file_content, **kwargs): res = None if file_type == 'text': process_text(file_content, **kwargs) + elif file_type == 'csv': + res = process_csv(file_content, **kwargs) + # CSV page document has been splited into chunk, no more spliting is needed + split_chunk(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index', gen_chunk=False) elif file_type == 'html': process_html(file_content, **kwargs) elif file_type == 'pdf': @@ -334,8 +359,12 @@ def iterate_s3_files(bucket: str, prefix: str) -> Generator: # assemble bucket and key as args for the callback function kwargs = {'bucket': bucket, 'key': key} - if file_type in ['txt', 'csv']: + if file_type in ['txt']: yield 'text', file_content.decode('utf-8'), kwargs + elif file_type in ['csv']: + # Update row count here, the default row count is 1 + kwargs['csv_row_count'] = 1 + yield 'csv', file_content.decode('utf-8'), kwargs elif file_type in ['html']: yield 'html', file_content.decode('utf-8'), kwargs elif file_type in ['pdf']: @@ -345,28 +374,32 @@ def iterate_s3_files(bucket: str, prefix: str) -> Generator: else: logger.info(f"Unknown file type: {file_type}") -def batch_generator(generator, batch_size): +def batch_generator(generator, batch_size: int): + iterator = iter(generator) while True: - batch = list(itertools.islice(generator, batch_size)) + batch = list(itertools.islice(iterator, batch_size)) if not batch: break yield batch -def split_chunk(content: List[Document], embeddingModelEndpoint: str, aosEndpoint: str, index_name: str, chunk_size: int = 1000) -> List[Document]: +def chunk_generator(content: List[Document], chunk_size: int = 1000): + # iterate documents list and split per document with chunk size + for i in range(0, len(content)): + # TODO, split the document into chunks, will be deprecated and replaced by the ASK model directly + chunks = [content[i].page_content[j:j+chunk_size] for j in range(0, len(content[i].page_content), chunk_size)] + # create a new document for each chunk + for chunk in chunks: + metadata = content[i].metadata + doc = Document(page_content=chunk, metadata=metadata) + yield doc + +def split_chunk(content: List[Document], embeddingModelEndpoint: str, aosEndpoint: str, index_name: str, chunk_size: int = 1000, gen_chunk: bool = True) -> List[Document]: embeddings = sm_utils.create_sagemaker_embeddings_from_js_model(embeddingModelEndpoint, region) - def chunk_generator(content: List[Document], chunk_size: int = 1000): - # iterate documents list and split per document with chunk size - for i in range(0, len(content)): - # TODO, split the document into chunks, will be deprecated and replaced by the ASK model directly - chunks = [content[i].page_content[j:j+chunk_size] for j in range(0, len(content[i].page_content), chunk_size)] - # create a new document for each chunk - for chunk in chunks: - metadata = content[i].metadata - doc = Document(page_content=chunk, metadata=metadata) - yield doc - - generator = chunk_generator(content, ) + if gen_chunk: + generator = chunk_generator(content) + else: + generator = content batches = batch_generator(generator, batch_size=10) for batch in batches: if len(batch) == 0: From 95adbc81315bb974934cb50f708bcde097bba4e7 Mon Sep 17 00:00:00 2001 From: Ning Date: Thu, 2 Nov 2023 18:58:41 +0800 Subject: [PATCH 12/21] chore: update glue job timeout and format the code --- src/etl-stack.ts | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index f4dcce02..0ca65349 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -36,7 +36,7 @@ export class EtlStack extends NestedStack { type: glue.ConnectionType.NETWORK, subnet: props._subnets[0], securityGroups: [props._securityGroups], - }); + }); const _S3Bucket = new s3.Bucket(this, 'llm-bot-glue-lib', { bucketName: `llm-bot-glue-lib-${Aws.ACCOUNT_ID}-${Aws.REGION}`, @@ -49,21 +49,21 @@ export class EtlStack extends NestedStack { // destinationKeyPrefix: 'llm_bot_dep-0.1.0-py3-none-any.whl', }); - // Creata glue job to process files speicified in s3 bucket and prefix + // Create a glue job to process files specified in s3 bucket and prefix const glueJob = new glue.Job(this, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V1_0, - pythonVersion: glue.PythonVersion.THREE_NINE, - script: glue.Code.fromAsset(path.join(__dirname, 'scripts/glue-job-script.py')), - // s3 location of the python script - // extraPythonFiles: [glue.Code.fromAsset(path.join(__dirname, 'scripts/llm_bot_dep-0.1.0-py3-none-any.whl'))], - // extraPythonFiles: [extraPythonFiles], + glueVersion: glue.GlueVersion.V1_0, + pythonVersion: glue.PythonVersion.THREE_NINE, + script: glue.Code.fromAsset(path.join(__dirname, 'scripts/glue-job-script.py')), + // s3 location of the python script + // extraPythonFiles: [glue.Code.fromAsset(path.join(__dirname, 'scripts/llm_bot_dep-0.1.0-py3-none-any.whl'))], + // extraPythonFiles: [extraPythonFiles], }), - maxConcurrentRuns:200, - maxRetries:3, - connections:[connection], - maxCapacity:1, - defaultArguments:{ + maxConcurrentRuns: 200, + maxRetries: 3, + connections: [connection], + maxCapacity: 1, + defaultArguments: { '--S3_BUCKET.$': sfn.JsonPath.stringAt('$.s3Bucket'), '--S3_PREFIX.$': sfn.JsonPath.stringAt('$.s3Prefix'), '--AOS_ENDPOINT': props._domainEndpoint, @@ -73,7 +73,7 @@ export class EtlStack extends NestedStack { '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat==0.3.3,openai==0.28.1', '--extra-py-files': _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl'), } - }); + }); glueJob.role.addToPrincipalPolicy( new iam.PolicyStatement({ @@ -145,7 +145,7 @@ export class EtlStack extends NestedStack { const sfnStateMachine = new sfn.StateMachine(this, 'ETLState', { definitionBody: sfn.DefinitionBody.fromChainable(sfnDefinition), stateMachineType: sfn.StateMachineType.STANDARD, - timeout: Duration.minutes(30), + timeout: Duration.minutes(60), }); // Export the Step function to be used in API Gateway From 728cf216c6c2ad0aed884bf2716b66023cef2a28 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Thu, 2 Nov 2023 15:01:32 +0000 Subject: [PATCH 13/21] feat: 1.update pdf process in glue; 2.update aos api schema and backend utils --- src/etl-stack.ts | 8 +- src/lambda/embedding/main.py | 29 ++++--- src/lambda/embedding/utils/aos_utils.py | 10 ++- src/scripts/dep/README.md | 1 + .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 0 -> 16522 bytes .../dist/nougat_ocr-0.1.17-py3-none-any.whl | Bin 0 -> 82497 bytes src/scripts/dep/llm_bot_dep/loader_utils.py | 26 +++++- src/scripts/dep/setup.py | 1 + src/scripts/glue-job-script.py | 74 +++++++++++++----- 9 files changed, 112 insertions(+), 37 deletions(-) create mode 100644 src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl create mode 100644 src/scripts/dep/dist/nougat_ocr-0.1.17-py3-none-any.whl diff --git a/src/etl-stack.ts b/src/etl-stack.ts index f4dcce02..5a01ca43 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -49,6 +49,9 @@ export class EtlStack extends NestedStack { // destinationKeyPrefix: 'llm_bot_dep-0.1.0-py3-none-any.whl', }); + // Assemble the extra python files list using _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl') and _S3Bucket.s3UrlForObject('nougat_ocr-0.1.17-py3-none-any.whl') and convert to string + const extraPythonFilesList = [_S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl'), _S3Bucket.s3UrlForObject('nougat_ocr-0.1.17-py3-none-any.whl')].join(','); + // Creata glue job to process files speicified in s3 bucket and prefix const glueJob = new glue.Job(this, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ @@ -70,8 +73,9 @@ export class EtlStack extends NestedStack { '--REGION': props._region, '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--DOC_INDEX_TABLE': 'chatbot-index', - '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat==0.3.3,openai==0.28.1', - '--extra-py-files': _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl'), + '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat-ocr==0.1.17,openai==0.28.1', + // add multiple extra python files + '--extra-py-files': extraPythonFilesList } }); diff --git a/src/lambda/embedding/main.py b/src/lambda/embedding/main.py index c6ac91f7..300a9a63 100644 --- a/src/lambda/embedding/main.py +++ b/src/lambda/embedding/main.py @@ -124,19 +124,30 @@ def lambda_handler(event, context): # parse arguments from event index_name = json.loads(event['body'])['aos_index'] - + operation = json.loads(event['body'])['operation'] + body = json.loads(event['body'])['body'] + aos_client = OpenSearchClient(_opensearch_cluster_domain) # re-route GET request to seperate processing branch if event['httpMethod'] == 'GET': - query = json.loads(event['body'])['query'] - aos_client = OpenSearchClient(_opensearch_cluster_domain) - # check if the operation is query of search for OpenSearch - if query['operation'] == 'query': - response = aos_client.query(index_name, query['field'], query['value']) - elif query['operation'] == 'match_all': + if operation == 'query': + response = aos_client.query(index_name, json.dumps(body)) + elif operation == 'match_all': response = aos_client.match_all(index_name) else: - raise Exception(f'Invalid query operation: {query["operation"]}') - + raise Exception(f'Invalid query operation: {operation}') + return { + 'statusCode': 200, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps(response) + } + elif event['httpMethod'] == 'POST': + if operation == 'delete': + response = aos_client.delete_index(index_name) + elif operation == 'create': + logger.info(f'create index with query: {json.dumps(body)}') + response = aos_client.create_index(index_name, json.dumps(body)) + else: + raise Exception(f'Invalid query operation: {operation}') return { 'statusCode': 200, 'headers': {'Content-Type': 'application/json'}, diff --git a/src/lambda/embedding/utils/aos_utils.py b/src/lambda/embedding/utils/aos_utils.py index 41f7e0af..6b80cc13 100644 --- a/src/lambda/embedding/utils/aos_utils.py +++ b/src/lambda/embedding/utils/aos_utils.py @@ -26,12 +26,12 @@ def __init__(self, _opensearch_cluster_domain: str): connection_class = RequestsHttpConnection, region=region ) - def create_index(self, index: str): + def create_index(self, index: str, body: str): """ Create an index in OpenSearch. """ # create the index - self.client.indices.create(index=index) + self.client.indices.create(index=index, body=body) def delete_index(self, index: str): """ Delete an index in OpenSearch. @@ -61,10 +61,14 @@ def index(self, index: str, document: List[str]): logger.info(f"response: {response}") except Exception as e: logger.error(f"Error indexing document: {e}") - def query(self, index: str, field: str, value: str): + def query(self, index: str, body: str): """ Execute a query on a specific index based on a field and value. """ + # extract the field and value from the query + query = json.loads(body) + field = query['field'] + value = query['value'] body = { "query": { "match": { diff --git a/src/scripts/dep/README.md b/src/scripts/dep/README.md index 2ca3445b..d54876a5 100644 --- a/src/scripts/dep/README.md +++ b/src/scripts/dep/README.md @@ -9,6 +9,7 @@ pip install setuptools wheel Run the following command to create the wheel distribution: ```bash +python setup.py develop python setup.py bdist_wheel ``` diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..c655f299af7fde8f8034c6ec8bf46dc2b55c78e4 GIT binary patch literal 16522 zcmai*bCf09vaeU!wr$(CZL6!Q%eHNF*|yzfblG-w*|zKTxo?cK_ucQ@cQeOa_+x%! zM&??-$e0nKC<6kD3IG5g0gm#D8h<_jfdBRK=fLAa4vCY64+|9jCjq^Sx@~4fErPyj?yJo%A1R#f>cvT5wT zKipT(JZ8HYK7-kaw)60N5!)Hr1U0BS!OK$zOkbBLm>T8 zn#ep|nU}|l(O0}IULNH`^Q~GYbQ5%$GFd~Z?wY0vLPk{8PI4f%;qw zdG~K%8joC}Swc{DG0%;Q9wxu0X48>)c77n46Wh=%a6xht;YHn^<6?Go0`-S~Cp_R% z$R1_cc0Lsn4U=SCvSx#J5Nxp>A4aiHagF^=a%OA zXhqjx6KCy@c3IuJ-Cj@h(If1TC30CIIcpn9#dyAB4bXk~{xe>Dd6>wO+M+aeX zyv6fcC!XV-4W@Mc8D84lj%d=l;E5x=+2NBymG#a7Ux#l%Q!~b72FY%nyXhpVX{?FO zdcWrQPJ0HZafTa8R(&%8gFymar7<^MAZ;Tr{@JylAwh`3z@_BQ<}jn8$6Z=nlMBt+ zmSnCs4zrn=$E7Yl@@6Hz!pqe}<&EizAGOM7?Oys>pn9Ix5G#yxb;zuneMHbsr#1H- zfAXdIm@Japr?*?C`>u94{f@Rf0{MfUWDnKB(B!OqEY16!@TE_@)sk%+4AUpL<%Gax zbt`wFxlcU^zy+=ZzC0`Tj8Wj$LxouCHd z#~S2>H{P%xJMLcz@JVI*1CEiyh94(LH!y!mK()NSMt^)WFW#TOe;+jffcp;#FtfEV zvNbXLw+vipf7#&auX>E3X2BnOm{vVPI}pG zy>+f;6OYQXhm-U}(85aLhVTR*Ng zl17aZjwADPu>ph`=gO4=5@;-JRxYz++pmRh@gRTp7!)g*d6s-X3P+()EpqZ0cc=x7MAyJkJpM) zFN^*lpcPS~E6*;>Iw~aF_}k0EK&eh(Y;lna<<-5`V;g3p9!G)5VLP2JMHGjYXo2?% z*mqrWSYB~Ts(b5jpj3z>EJz+E7}`kA%LH>rMqX?bt~Xj^sliZmiELDq<(MKWEJwAbt& zdncv2w&&zc(~rOQq(>;twb2@^P8-aRWk_-bql=>{97u^{#Jw|%aZQ~LMbZi`gp%~z zQ|I{kuUlq>+e(s_&4j@zeCxoJG5N`vj)XW&zt>X+2?|o@_NhQRdOBbZtrJfOX=a4M zg7wYl?xl%T%^{;lVBUo!cNo@=z&H^k!3HPpLluW;l7!!K9_`5QwTUdjqDcuewL>%U z4hRg2R|WdTYR0^grm_y%xF3Q6Ic{9VdvGqR&7%*A>Q}RA0e%O_;T*pVz4Re5F}-g8 z{5WH{GU(~_``DXJ{!P&N{xqA+@L22TdHvcvN6hit+vsa3tWbc@>w0Z4$nDGX?shO} znx7ypbim3Wdh2Js-yTBPmygNUkPi9D&S0DbgOBswDGx-del<8$mFL-X?xyF}!$<8F zfu4t@^TqgVOeKdvm*@3mF?#|_>sGh@b6;bjpEqB-!QxIFaPWGu%kj4K{sRj{ZOz|aofikhHdx5^uQB;0XOvrA8JRRv>AlQz?$dAI9N1p=@ z+(oy7#j^Z{vQh6bMxE`iICLsYz+vP89iy&-m1!m=X0s_a+4@@CY(3{wwX_{^bRSF9 zXiBfZ%?k!+Hj2S~r8IQso<{yqWh&QPj=f6@i6oo-qA@y`tY}t^dihLMbLAP0(!<08 z5&7?J9@wrKVjpVErSrJ7?50zfITgP&f8=9wWaltu>7nV_ez*2w7Btkk&F9vF!`$#P z%ag8haxSISK?}SCN#hosro0i5% z7<*IXBQ6FT^-#htiZ#W2qb1$MVQVwDF*Xk<&8*2nh{1M^RIWC&t~9d+|kPSVX^ z!A>|FydG+zqK32`Wi;3?tV)U?jri%Uo8Mxnpp^pdFVdEVfuh)B=5sJ%X%Rug8uC1z z$DMSTMt|QV^tO~mDdd0+unpL)Eipwin$zqyxsDx-BocuKlnG{!@hz**&S{reH$_%f z8B5cLCRrPGb|kH!9gP7@~_a*`*kUra*!W z5c-c)4yc7f%N$W!Oh}{UsmjFHNH~A5LpN!d9Pzg`ioZsdLcNr?`IRw6Sb<>55iu*(@|5d&ww5^U^bBk8oWu25>CGJ*ox5FJly z9c%n(%y7mB<|T`SJM8Re)|xfDSPBrBpx9t&Kp^GHbZj&k5K(=ou^`6kk6)XOu(!!X zd6MOJhmIL5^73OveV6uc=@?6MmYgC5vch|vuiFU1WbsJQ?a=SD7IJX{vl*; z7L!ymszzZUw!PhS3Rv+I+&$R2aVQ)C=_YJ@nIlZ>{6$B)v%ejm9ezsk2)Y`(X2lnR z#*)Ful0YQ73<#-X1$vRE1*?$8sw#3RZv6_FqwUWb=WowF1*W&*?sDn8XL{fzFWk>b z_Zl-ov#{cc7%^j&5>{f|wE%~+O)k$XYX@eQ2S`zbxAaohD3(GK&-&rL8+0hP6#A}9 zSH`|uX0AC$A7R&NSnOrYA`LAZwS+YK z8>UUz@}sfo#B@x_D7+iv%X=8MU0|I;Gbcn}5-Kc@Wml*&ge5|b*2LRFOW-4Rca%a5 z0tGSltLkok=nm?s#CNB+Mo1~>q7p?>Pi|07pyB6 z?sZ}ny*l^5Hl`>mTm$_j34DPV2-xUqSPvGQ5oD63|F*K8&~Gia&K;?qiJ-;@BEc|U zXdwaBH?^4ND?8NAF5(1IJ2@js71XfJG%sv3XDu8Kv?6!Dq~}heELgT?5#n;G=;N8q z(o~aJB0eAZv!WEKs};2L55))V zl};Es`eL8+ZI>Uqg_EK|&bRxc8HFZ%5jzmaBqR7uYcqwY@OmN~dn3@EhC-f$ZNA#S z=~OmxEzylD-Vuz!m9)UaQ^26^Sth$;*7ojE1dl|9xUR!nOqQihfY9kgM=fN@ph7*D zY8I@gnWLG+G&6+zsOUvp^Q>at5lw*4;yWpWp|KDJFpdh^ly5AAe|b^SL8n!|ECHNP z`yGI?TJl)pNa(54`IHz$!(FO;6OG6-rWjGQE~J>7@8oY7rHl>lrmSN?;hK4d&9D6O zm<_bkvQb7{rQXAX$Dk^0)w)|9gXwH!Lb=low3ZG}X9<7hEFEWRR^7gsJAWO)VbDl_bLn|A5Qu;XQ*QWIjLI3y~x}VmOD%N`GysPPqb&w`ub5g ztH!$umgQ|>4mZDC?p?-ciMNZ21s@eESPZ_ciYg%}SFZ@gI<5d9UO7OllLA0gbUs#e zm?qaApf${C?6og#o_OjpSYlLAPn1xQDx!=|1)F#d|AJ>#c>(g5eL+rx3a79|Pd^Ca(nrC%y@>PPhm9z~$0&gJ!wC+v^H={!0uWyMM|)3mK2=^nLj*;_~@gLkab zi(;O5?-mqcz{r!JPY(!0aYZ2jhrP8>k;9S!>8eD7)G{3x7$XDCef?D+%+DecZO48| zHvD}|^yI1GF#LA=D1OU9H+(y|u?V!9;#=~rcHU#6Qd0CF^9vgImdn9MfVAjbk1`_X zlWypf>9IyLsqAS8q@arQwTnqHE?5J)n9sNA7P+&gnCdt(TC7oKM{enNPezOl9Y^bE zvl)Zo3InP=x$hyrb4TW7n2@$xg~-i~(7LpLux2+BPOvAXNiGBTM=Xn(xa}?O(xq9Y z28BS&ufvYZtL5H=#GvXn)}63iT&x{2uQn-)9^{Vviw4_4eiQDi+j>=v#&N zPndg?obTR+oM{tk)*U)N2*ur}oVQ}Ci`Cq93nvf`Q%Y)!%W4ap1+6ZmInIUZ>KZf% zPjR0O_P2xhYde`e8=1V+$)CpMo?6Nm-W4*fs4cIfI;wgB?}X?d1b>B999mi$F1yG$ zxnKZ*I067b@XxTy-PzH|#M#o$_TQ0JjE0TfIy>TLje!s)4mn)Qnh)DHkXEKKP$K)t ztC=K7NQykWh7_t~g`E0+&(&p_8_&plAN!z}&*ddiafLVzg>={NN>2jHo~DZ$CTNj@PYI(rWDmK!F8}FmqtMDZS;Q@;ln*3 zfFs%EHOEI?L6ohc25fqq1a>Bm$%Az65B+2m&x~oKl^=*PD`yk3$fg~)^lr7ePz7T+ zHgB(hTotZJ1PPWHK_twoh!(G~7{8@Dc?P(9Qcnd8lNl=?1$s|$s~nuD*?l>*c<~kC1#M@ zC$6mPhk3Try^m)%=e+i*JM)7{WW>De*R8f<j{L1*;%qRD2&1|J@S(*v4&eoy zSl9UR_=sDsJ=y4L!Ku@@N&V=*d|BXnZlv();;werk0c4Gv8A>^{&d&m(p|= zxA`MaD};I>5VXqY@Gq;|V7hwj#h_x?;xve}ctj%X?}sZRyCrJhW#>6wA9sV^uKC?P zT6hgHC*8ZrORD3{&5(HpGtZW6D7ZONIWXRFqq%%9YBj)cs!Dh202e`A$Cm;bQ^CYm2f5FZ^N zU}tfGsL3x#!FNE^3?-w+x>V3lV$d z&DKRkSt1fa5$__5F)%28yeSaR#e=2R&8E5y&n=-kvhF;jfzo5q)cldqMq03FXwS+V z`{1~%k%dfeAFLe>A)*(2Jif3gFL}V`s^w2P<#SsU}H@aVDnWQ}c4{tQ?g( z?65DJC0X^InKvyW8|u{RaRDe+tiS$%Lhs7k(`{XZn?iwkJsXeY2o&b6b^Gk2NM|oI zvnEy*3Ea@SusfPFOp={}T!f-`VN-Rga*8y4$oK}_{AxHaWB_#0Nm=d1I}7t6B0-cx zQ6gUmg!63hFm1ej-k2sbSXtYC0_*8VfsT_HQy&R)JsI)PBQs&%l(by4TJtk96Q+T& z_WdMQmaA)^YR4htOcHa@NU#C1@l|gBv!p9E>K8#i54xw^>1WY`cL*Q3(+TO}HA&M7 z5>{}gm>+*`!HH3H0r+pCIr_~D#*VHXn=guP0pdHD&wP?6wq5%EJMC+O{KhJ_P`hpK z;jAZWN905nymmwZlfynN?Xhd<+<_Xir~*s5Ahsbqu_wbXVDc6nq!P;z*7%Yga1Ngi zce8f#Ya4K$fG(3+7`-{GJuM8no(P+`t>3;}?RM|TZrph1`fNrmaS%@Y9%>la5a4ku zG0&^UTv#Jtf7>-DlQY&hBTYB;LID6Ag#ZAYe`ve4osp@TNFsNn~DDc6G>HU)#PIY;`+@>EQr5+ziF`(aot;RR*v*VDS5HzFa`y2IN_tO!ul zpm9%K2Vl=R8-t9-ogu|e%GUwO9OJdL?AsY}t`m{xII+SiH?@WouSvxl$a@mr!8f=8 z=Qkr>hPQK0^y{aO_UJoW>DTkJeKFNTvE1z2FIC=5CocAghNu_n$VU5w>B=PY1J=<( zFb2^!eOk6D1Cl-g+-|;-950Be!DP?nJ%ilOZkH-RMVpYZdcLpKH-J1B`xM&<)|WDV z*H>Zq(isF|^DdM>6rX#a3_nBZIOmSaws>Gcetz^Oo2!0jlyoeLu`{fwULn z5L&WD=0TTijG$lz`^GCqzPI5+ex85qyz$d$)EkadXBKWUx*Mnx7I9tM_w7~@t=e~; zG)9YPq&FsEVBi|uMzd^lV`9!h8`BdjvJPtOltz^!2NtZ{b#fqwTN0N1L|~y96{jv< zA}9$BW5rGq`(o}!u_7lC(e7dH#c0d)$T?t6YNMDzi#06i{GiNqUV=$x)Ozkf4 zrGPeYHbu;r)I$rzsyudu1`+7fxQy)hA`&hJ{7x)m9F4;pZq*VLy-!O&`K+SqW;2b$ z+jV*u**~`i#pkDC0k_84On2m0Wb-{PQ?dAEtCigr^czpvJ!>+Pq_%!@Q?&z-JI`Ed@ zx$@Gw5$KS@{LUeU-$C~N2-p3$G0PK`UdkZK6|!-)ZpO^RXcOlo`{;!{F!NA3mWo7V zxJMF*)$r%Yo~88!6t*4FT4{T^#)|!=LqlMMuJWvWj^5vlBy&YtHI#%Ww8AXDVn7*& z-LW~-vmSwP8TM=blAy_ewOS2c5>?On0Ieii0rC1eDBc49YdNL;K}oNs6wPF}C>AAQ zSi||JdSDi*mz0}3+ybiYJ?8!u6v%x8|Cr=$3MT7Wdtl>Y2zH?GS|jz8+Z&JDt%-0=bm5VuibM2xj34^ zBUqr9Q#GuvOU3rZk;j3LTkH{{!;5I4!;u9BVSfe-*@xVYNoqeReMCvg#3a-M!4m3yKrkvR@ML(xk z_3E2BjP&?Ddy9};O@cULiC#TO%|La62RkDPj`+tw$D*KsSesQsGp`-S4V($Zbnvviawhf>ozlNo+4nmQp7U7nnjJu(zEXxDjFdTtQZ!~wfmdz3)Ox}-wfnJnE zvW+1$U3A)o5i|W92siSLTP+Zux?Syg=|p`>{~kme&s*xXn^^s7*U!OND9nBp(lY+%SPmmhwXn9*{ z9_Ny4n2(HTuU~q)^UHa6vujMzqrcqI3dL;%udH`V!<#&LE3V|l!LTYXlw1%<%fJjW zRF5`C85$?{ccrz@&j*Pv=H*Qi?dJ7G{f6bm zHuKf_ReQ%W5UE%W!|7g~jWi|0Pu?!LdFLH&z;1I0=>*Zmbr&$Azniuzg)wC%lTK(` zQK-g-5J2lrDuMhH>=8ad2ST!aJ3e^&hwg~yB1LZ64OK6JwuClUYF3pAj^db-eTDf; z$kXWX#~}oRMcdV6ou-{$?TqW`dfQHm$5?d^0JJDWBhMDd#Ai=W$DT^ z^BIn7k+q?DE#X&%VokJyGS4ay8^NI}wYCdTUbo6NOn)7h4LXrE7d>3_hBdmo<9_-{ zAmbHajZ{yh8900Ftb}x;&?0Hn>wUVI%KK%;kQ)D(7@fuAp;s~sF;cr@f@qwpDE^eK z%IH~|@gvo+Ufx7xvSlBjT*7|Aq1$crXdyHreT{+%6{oh{%+_{*gRBnL8?0Wg**l(j zc4p^R!l%+81_9EfX!53A>k z_|li_b@ZmUIMEITi>IE4mfC~uFF!gqtdawNZTQdA2Y!mC>FN}Wh1>q`FA>_ludWg= z`SM9j%QhG;rhEmVfslr=t{JM9-;&7zk{aaS4fhPl+Az}x6ydHQ*jVvR#M4e(F?hcV z`uMv4zE@H^Jl&|rXC8N=E$NOyHVVL|!^+1?uV~OQ^-vVvn)(%U&>oOFE{PkwyL3pu z&J90 z^pKRr-V(E*TII1nr^-Pq2!kM!*hihAa!Gq^ zM7Rd4bM2$^t*jhW0l#vk`S&dF3&AFUJa9n-W?=C1lGTuIJB}hS;N2#$37IbuSE`hS zsTX1&kkYKPP;0 z$gQjvR|%%Jy)5)tr$vmmrO!imbGaOb3X3C5c#~r`hpSK?yusrtnj?7L046f_j$4+o z_C7#_esx~nnXVnMgkufVR1VTsNymv}xK%n4l0QGqb5Osm4-boFtO4>m<>Ka4*SvpW z_!I?_dxab!w#8>`IqQ*Xk+HBvlyk{)DnYj&z_LWUKtoaN@3--Aq7GvNlls_ZSx5_U zsnMl`zHDr&2iYWIxw+cg8HjjTOCIAb}oh}&cSmsQF^wnZqW0xZM2o?S-@C=Wa0*9Vsm1iq6IU~3(q$HP@$q7v656ZRD> zQh5iZ#V~Bfd&?V;fJ<3!z#6qFPp=Fi)m~g|(Lfq$rs(E1yzad0Zew=t;bOr-PKYvZ zG*N5PD$jMY+k|&p4zzTEQr_@2VN6Rv;P~EgF(MhppaUtIB*kqL@4P z#m_xWyAvswISpZ3-Pofg%yRvu4q*W(U}X;{9Y5{2$g zU^-L}hFX1tL5vJLN;IB8@1gJPC4myd&nk+MY%O1K_?v#fsJliu=wx*^2>9<$^U*;S z8%eXYJFV^>*wdUle&1*M;QRZ0%kwiTKM`xW^d^&%5cm`CA49#cm6_QidZd93CXL(u z5B;wYp4grVH1+iM^Ar~3m`g>&x#68WdS7+)KM`VhG4YQRzKXctj5>84JE6k&ij~>- zWmcw$9V~BR?D+_=H#xy{=G5MPPjsEpS4)_RYoD`zjzKL)<c+kC?MuU1S-z|E34;)&BL>b~f&{`_{p@8E;&GPvnf1 zy@$JkbpV9o?OW7pXcaVZLPvM863iDrp9aQdi5WeF?49+-Y{|>qY>2iUQ{R*8^}e?J zU}z4vI%Br$mG35xWuo0h-?*Pvx1*%2$S-h8dc z)N$A%0?uU?3m7nF4?B?jE$OOzoNDP3b9CE{_?4hOX=+*15a0G)rB@9YjqL5Y_BPxC z)4E^%5$n;m;}44%3H$be?C;*wwB2M=o+S<+Yi{P)JUt7B18@U!VUUATlRUHR9rSjD z9Z3nR_ISC_71eUdkZL6R1!Cz(5oHn|Tmdsfy~!YH`o~;8Dku&t)7^uLKCchPcC~$q zn@)8Sqm^0_eeej)d5jKD0nMa z@i0j@=!Yd9Y}?TuxN#?;G#I-yyq1}(qs{u|9B0+< zr7cBDmq+?NFk^lyu?V#&E@8y;6-S!!hvEeqE0HQu$vkOVV$nsXv#8RJCyQv_M= zz_Nl`zm2Z3a{|G+EUX(^7&CUY{$)nvV=1AEQ_ym^%s%HAX5nRIkRBeL@KVSDHXC_x zZuezBxRUTSe+Vb0e;$HBw2pDd72nJ9kNeTx(GCg<1$jO`(UgGffc8qj&Cv-eMp2FI zj#ao{&YO@uzbBVNyyi7Ir38o?Mo5X1gBV5dQ2Cq&PU^DrmPJ@g<9FXt8Dn-?TA(R6 zv$M-zz@7UzFaTe!F4VJAnS}App1t)ydgM;9!3ooix1rhedza|ACPV@^7?WtmTv+v} z{+kx_d9N}gsgjof_o~LyWe-=JMP!BkQ=~%$P4~gB?uc%|Lx*O*bghy88 z`UO1p?8EHrPune<&D(DO^4}JGSoNDdI)Sfdt<#bsd3ch$TCB+$KL#70z=qZ|fgok# z-)#)*de~V$(C)$qwP6cfcqxS*z;_J+hCvt%8Vot5i#<{&hNoY^f8}GG{}jmfdY41} z00jU(VgBC&Stpx+@7Kks>-@>Xp!zP?aw;lmo}w0eVKQRc2jm~mz*<7kda=L;q{N!C zBbP_0b}n4|{U$1hIVjQ!?nN4PdYN*gSH%qNGq2J)&pQ{Z`H2cGy((1OR<<3>hQ@xL z6Dtr=kgRLrz76}GyyCsL4N{o7H&fjed!Z29fX&X{`>Uh~^S70jnkfe*&#J$#bxhpO zaSF}FBgGySyB94qvU1@99?o^NJ--~WfV%u*S@I6%prM`m7iEZez5t8T!U|YGfV7dF z_=IK1>(Zr|!}YhdO_o8YIXVwCBs;kI%*FNa@(o4A%6*%Kk%HPytMyUbsC>3eF%yUV|_oV(xd%Iddm)C(CR*8`OPD{BCl~aN+h8qSS{FR0t@(OF(13 z!jTS-JP+_dT`~jCFjltGVbCcsO%?P#9`q@BcGC?Sy#V*E=VsFNv3Gndts-x2Z)M!Es+atjJw0>t4C-K_YXdbj-pgf+H8{aY@dY%rFPMaNOwsu!}EVXD-qf zI1%26>pfm!OKUa?hEcVTm#|24FM@=+F}xT#^E4Xa^$M=|;S-hQCZ*;@_$(_F_H|$8 z49!92oeYdpDVJYB6-S}Y3t2Ikm6~;aL106*n&fr!71Dn73k=Iy%;^?uuL{H*{V7Cks?o3IA#@WV(8}-*Z*{n*VnygDhTK=&5 zR3;Um!rb2I9HWwWi4Z($5o3L|SRhg>X;XKNdBHRqZ+CRQ?+SOJ!-meeq2L*Cyy7#&d}N=)ZZo3Umh zKr%K7uqJ~iPpr*nK9vfW@zgm;EN{Qz7JrvjrE9&>S)e3h`IE?F=Y>ROeUnaALK?ITSaPty3%h}x zmD&1cX^TV6$~Ed>i-I{H0@tsKy#~ zT?O+o5CFgw765?vkKBWky|tyY^WU-$U7DMA>n-ShPX3E_=?tawhkS( znx45?y+)*Khs~l;q-x7FuRT{;BFe}%6gCYHuSEGH19vYx1c_A{+BzkJ7S(5wt0E+e zp;E1H(s&mfH)12NBdQo?JNpIAmJpM zJs8QuynA%LsoYRu#F*Jxel;&OUq-N{io?bwUPAiqS?%{18C8HkQJHvXMH&xfDBHIk zyQMW*GhC!>20U((TiU0_@WfJ8?`MGFtQz+XffEy{(tn|PZqSJ56TxGFQ=(8~K5E-Y zTecL&I`{h1^*bcy@Jzca#qa%)tSUq+M!JJ&^OXCp?+lcYZ!+G37D%J_RV@4kW;e zEV-#tjkzvgcY51yfJy;U*>K`ne3Npf_1^#DG2{*p_v3oNcKTsx0?FrBT;6lHItWI< zeJi2TEpoJpD@~RoQIKSZd~ypA&wP~%v*hs}YJS^*KlKHD{gpa;sp4Py|@m(QD0SqqI&awn6QK%qt3Lh~gG7YH0Zl3>m zcby@hUjKj%sq zEyKa}cZK)(_nBJZI7{#{W-U9+EjCjp)G<)C6Qz!DycwR_aUR|p&~UedMbgXP=PO;7 zBQh1P({i#J)FR(vby0mG&Ipk^>4cmL(y(1A7O2C9n%1J;lFKhO)i4S~37f@HvAjT# zj;tr)THvo!$kgTF%s_v@ZRkMHq#VusXhW;A!e?x=GD z;x3>+kRF~XI?_O-Zkno|t|lo(+lI1vT6etu$~MxICJik?1s`Jpp`ALE%rNSl6i7h2 z0`sryy`iDnj#B`FNSDS2<^R>2-i;FGjQ`UPN~bmN!GBz*=)b?QyyB#rPLx5u%I=9KSwUeB%W>!-Rnx<9l&a1(0rw2;;( z)XXp)j+2XFMfGJpb+rTk=s>Ue$ca9wDd7T_>!{(`?yf@tjlp4Uh@p-Z4vXK5W6y8% zWDWcIKw+(F4u-3r2`?e@M`W8GAy^APtDe9BK!ogmy(0o6%|TH<07AU<0*X+eK`Cv5 zV|LlzU~$R*?d52_JLq<-N*$Poj-w~8Q=NH)IpEm?XJ2Q*u;q&`mm@4C?{q$CCoxu} z(OEc#%Tt=&n$EVgMN0h;(zAGH@G3^!8dJ~0 zl5!d>NO-P7nzT;f_|RHLVj}XO^YXS3^q>BuqZaAMmdqXr4WwQP@_0e~KMV_opnh%k z2@|g>oJ)9`yLY(CrGd5!g+Q2?19TFuuDRz}wY`&Ethje$XE@sSq9br&WWn3%_1(!P zl^bAOJu;^hhz}A?`sfwpb>GoJz%^QtkW^14HwICD>G4TyTn`eyGp;B%2Sp$^6DJ$* z68=P`&fl{xJZq7npJy$A@r1RknFD$=q2q>Y-!v%`pC0G3bo9#c^lO9OEb722SxkJC?~lB%fay#n;N<|`h4!s+g5 z*c~N6y>;z#q1Lq1@e90KcFp7(D6$(sfuS#7$NQ6q*Lxb>PG_A1hTb%<*JQvS{A^2==?l zZa3Rm5{58_5@PzqVUn1OAO4g9Q6SMi{L zeuHF!uzG}EwlB)gOb~3&6lZ~5D0KeFY-r74N^UhEzj*o1?Gy;Zup`>qOp!!>Pjjw^ zDvh*Vl`O;o)QKWbB`BdX-hvp2*v zi;gC_C%ICA#M`(w(YP?zBCe?nxx~*HA4`oQ?jgm8e*k}mmhG)!(IB%$i_(=>h~TV2V{J#hcBB zw|~T3(D=M>>Cx*@K>73(O)v8b3d27}CNz(`;)ZufVc{);J5c^~ecoR>vv|`S?drju zhC!o|8IfxWt;L<^C=XUbvWJi2~ zgjgGj46R2tyepH2Y?J(3&bAUNuiD+wD!nT^hiB^useNog*RRf>{!DR-)MiAc7PalJl?WdPs(V7URzCfcz{OOxBe2bu$miN8IM{mY8{Vx~*oKdb-O zAo^dd(2x`plZBaW?W#|c2t;@p_8T^WoS^Alpdv}k6?f(SDIm* zVQHCRncasTn`W4Opk0Eaq?4W;m2FU_q@tEPfR>VNRi-RsU78%9mRXdWsNCNL{>KG| zPaSZ9|Fb06KZELjS(>w*y@9ovtC=;uv%B;7=xB_>7(;}D6y5eep&=IGp91A0D2~vnlw|mfPla2+8F%R)LKO;2&L5jw{uz{%9iZqYp@AQ+uJ}WQLUx$5uuNJf$yniMy3-z=Yg>5e5eZdO1}cFR z;z^4dh>tEy7}s!Q+5?ls+<{?7_d2O6iZDKI=A`eMd8Atw>D!zxl=6{N*IDGFwWrxz z4T0=!enE|UUHaMHJgN>1F)Zy^-03~4t{?#f}Tma}0P7H-gnYNRyW2f}aSaAx=FgGg`h^d5L? zuP%Bn(PpmXcvK00C|lGGCMJOWm=52#WpgAf%fA;$Z=HT;eeh_TwCZW26xS_L3vc$+ z`#lIh65#}Y9`nRj6^l|xnRQzb`yps&A~cNSP3(n(8?3vt{<%?Mx#Y~Db_yH!Mm^jq z%1@YiKXx!wp{6Q6lQK;_PWKGf%EPNX$6Y;mz;xqv0mta$#+v~-Vicip+N_+Lj;z`x%9hsNi>L;h3a^Z!Kv08IfL z|A73x4f^kt|I|DA7lr%JR@2{?^3P7nzmxt`x$s}41f+kE{@>!^zr+4hecE3zn?Jqb ze_z%=s@47u`cIo?e}RZ-{yXSzTWEiW{HKWZFUSeQKOp~%U;mE!PfqhMRQ;dZ_IJbn zy9@n0=Rdigzc`JY|BdrMI--9P|LF$)B6@KB|B3(Q6#j|(r*!{?L*@Se+qP}n_@`~#wr$(CZQHhO>#qH9YoBu-Mx|=JbtTpLx|1kK1B0Ld z002M$C`gH@Y1r5of2J@!515*O0@co(d#!Nf z5G8xgAyd@I%b<&QKRZM~dRu1{se+i2U-P-kEwoU#%Zj8Owvjr4nSa(RG`7&(?@n25 zPONq{cAkdpY|w#;zFo^wX%jM|A{w#QaWG9Ji`~j1Vx?sQWWt#aq8p6~ zc``D4nF>9$y@lYcDa$p3hP4{G!rsX5Zt42@`tUOvpb{c#*!)ITpz2H~6N-Nu}%Q;ndXkroD<=*A}! zRjC%s32EC~r=RjgnY8@SZsDFLA*}?c7yU7WmZT69PPc6+cBbJkQyX-TRh3 zhV8${Mw3A^n75yvtE9cT#kVHvM0dZ%aNIy-Ym~dYpReQ7=cAYFKC^moCG}sHr_)&JIcLM)S1;mRLZl_*`Ee?jILl&aQUA@dghHg$dFPvS5*pc`2Ets$TtU=f)l5vVF;ogoIO!k_w8?iPsRrl zT;A9P&F)Hp$9TSFNwshiOQD2*C{0=6g@j3<4Vz$2PF@}w=g}zEVN&&S9d6$*WbQz} zu>~Rk=G-%jJO;rDhf0}DQC*ZzMp$yv1GnZh%!DA$e4h2=(Pg92f9)-^Hjdamv1mm( zE&B?GpSbg7j0%T2^TE@73hFig9stPaA((dBJB4;Sd{i;P00#mSJy-L^?G7j#W(oF7 z{qDQ>PgM%;+!r{^ggPOt*a&5?3PuY8@^sg0xfo!6o0$eN=~Bim=q0;nIA+PPK*le# z$$hxGiN%3c5z6*H&4gmX9{0`C#m2D7nI8I~vEZ*SnXeFHB20&81+|It(<@MA0dfTq0^jMwFBY_>~;-?pT*4aP6zer z?ckFMNEVRzD%!oobUD5wj*>T|cKr6|f-2|I-}JP;AS(by`XHlY<=Jiv*;#+l}= zK#Hq&ID}60L@n%VyE#KA3tX_{?HK09A?K-X{iz;0ng2}kgXi0~LVId!nml^F@p*)? zR|V8h`GE}52{F1`wybG$9XpU#Thdec_3eUbBqY|l;SdW{5Re)fIhMgoa^i^bA$C23 zMgkz`ZPlBgA;~doRRh^D?8Y6CQye(-P+qB{Llh`{w;f@KQM!bh3F;!qg?0r%MgR!o z^%trZE+cuzxwcY(@DulSvrR3Z(K!NMQ#nZz#$pXaQZCd6QMCFnpxi17XF9zSmuPvt>=x0Gqw#6pzU;I1#s zP?}qnr49Oz13lXW@~NTKs*49=$6OAfpQA!?i|;`3f%e3LWFpbBS_*MHxh(A)_u(k) z`e<}1D)&aeY^@A0d1!1@?KM90!}+S_F8BBSNhbW{YUxbT`vm5g?8#5n|87M_t{`C}Pg2rO{@9J03!b69wRnsY4JPni zQxprjMp_>{8GP^&>Zc9c=-s|O{MlAx0_xDsrZ-Z;cWTmH1jls51IRJDxIHsJ-arSU+zi+64%2us&V?6;^x%Nu z4FFrV&L6qGDe8mW*Tj)N0blsI0@lUTt<>M&EXkumHuzX^w(_~<9b5+|<-yqI>_Z{^ zBg^&+%VI4BD7u)+Y*+&XOd)>^z=Rf$PjTXZ5N@InVJs@$C>BsIVImfh>lRp`_ou}_ zJ$--*{}(8b6Y@&CK>`3A;sO9b{Ray6jwZ$yM$Z2O3mP{60}H>l{eJ-p2qKoQE)Ol{ z@FO<<*-0p1NMq$o@Zf6*<`J7JG!a~hjZLtRdv}RB{v27?9Dsi(@RjO=xcg&`N_e`zQ?JU(_| z(i(Oh7i6-l>bRa+vvX^J$^>2zHa;5}+zRFV^8 znq496+GdQ;X{K8I=x#)N&dg?KcKjeBp57oEH8}OnLea!9DU5oRL;z5)%YC7+zD#fcl<09 z=16kGmJnMGe&xwQu%m{f%#4(LXvaKhCFv8Q%_Ip%Z+v3-w1$GdJgK$ml4UWP2(w;@ zxxw_So%Ac?HWpp+!P@Rb&EzXfb6yj-`!a}l8T*PGEep<`r8 zC!Kcq$tuULfey?OKBuc-7iJ3iK`yH zU_l&9>VN7Me+1fdr0o2^$ON^pAY|*k4MybrVb{rNGHQW|LL+kYzx@`I+y&jN&m@|9f zA$O7O)z4~iXfYo)2%vuA6Q68VRM$*ukBig5DcXENu~k%iP03~+H6ZF(25W((su^q= z-{uz7y{Mir^eb-rC7_(7cTCBQ!~Q8Q4#4=LCyvzXO-V2EReHG^H=3{4ASMw)u3HUx z))vNV-$;${q;gL5D>b&-*pY%ZLFJ<61cCS<5)wWDg^7*pqh7tilKW>@{sOfve(+Mw z5FNEAR{WdJ>(gcxq?n}kGZ0Uh!!2_Bxo{(9=>`&IJ7jcWGbCSM$bqO$@%Vu0?5>l& zawI?yZiO6LE*ihNRY-ZgZgd&hL`+tUpL|nJuE4-l)==#Q!Js_J9p^8^`;l;7*1LI_ z)@g88QVA&)&t{kbq)2#z-q(%*cmi7r60;;X!x$EcY;?AQ$3wdJ06ib!>F{vU85iN@ z@RF3tiIwP1W2}@wmYg(TmgZcY5At9|$A#9{5N~zfsmgrRP}jL^Grm<%2uT)mpCit7Ur5zxhXxsZ3uiUOf&T(GEmQc(0*JT;OmcxO3bm zEx3(W&8r58(>+sU7d(vm_X=|U+WqgE(W}k~yaglU5K9zmKSNA?b1<2LQ3bu|XrhOp z1{2VXQE9QF^s^W-U%EyECF1D~FxnMv8>tT0&=I0U-a0dzaq&O8Y0<`}a$ZZP8tRaP z)&~&>Qu7%qiy$GgTiJ8$z5ZEmw!ZACl8^U(iKmf_m_!1zMX>?DC z9>mGNLl6TZt%kj)qAAeD$harkN;I~j0{eslbSSPb7oW5E%V_Q43n=|IyzLfa zv~me&v;a%S<@`%ed_x@Mwk*g*5>1nBh$lKd{9WBV+<&y4?k`(e=Z8ZlV^(&8=<@0@ z?iClCxZxx+tM2EXuucW)rL)L0#S~O5cO*kVy_{)$HvMJgRDU#MNZlTO4-b8Pz8A9( zFPRdh>O_xpelBt*%+u@NB;@Q2KPzQckHkWil4Gakr`T3{be`fxDNV^c&|1GMs6~SS zie~DB9Ld7`h%mO-mk}Zb!O75_>_dlMmfk28=}c>nrc1Aj6G5#-_|Bt=n`B&L|Cl*^ zi#2w8xxC(Zt5{Mss+I9et3^g%H48Z9#d`8Z$kR2C8LK~F4aNJa2^^L>5}+wV@Hgjv z&yk;yd`NSmBkZv?-@_GkTHA=h(df%fGU9XX-W^_tINW&tp1+G%cA5IZRDN^5+}6y% zmHG7Fzu%zyGPEs~PtEj-LlA631G^AW z7i-#9vO<3Cs`<1xxxUe*w6_rC;ZS($&vP8%vbGXJsuES49*`!sE@mc(^1M+gu$0=2^aEAZtCs7c!h$hT%DT)MO zi|SNbWKJFx{Q>{qc;>jq!IT0D0H6#H0090Uo;jO1{STYPsLj}JaUk?!-*Y2$Dg#-! z%oxA}HVd>(0D&f2D;U2afq>A)Sl5#NBds8@jd$B46qQWAF1`{bf;-FGg9%eap+!gk z*KFGHPL&mOxRpcbkQ?75Z)SES764{2K9lB0*OyfHkM+XQ;pZk+`vE67v}2wXL(G=i zRI0+VhiFeE7l{mNa57OX8*{##pi#_YN5TX`X)BbnY6AIpyQY}jAxrC|8z-@a<=RPfg35DAIjt3c7=B`LPFb8cq^dQNF)tLYW zSrO*>XyuPt=`4KFov2Ma>Pxm@>$d4TCuiZWv;)AY~`c68ha5`Lb;gV*a8ZxBn8oATVgMsPZYyXI6;t$~Ky zttzWFBc(=d{Xj-<>2t|;WY)*s27Rcm-qHLg*+uB=0$daT0=g+`+L>6QY=yqyZP z?kxoih-VM$GI^Pe>d^(ac?e}$N3wf+m<@miswk8w===S7kB@O^g6Qg#u@}#J^car@ z2DkIM@gHIF9jcAUK&+Ka9f`Yu^bk@cwevQ$H6x0wL2^9ddz{&b>63mXRbqMYCc3Q+ zo*zKHnOqA{!>G(yILfrb<1J@n$YWP11dCwx#ym^~pF zcoUV}#Xv=`ePrdz9QC0i0Pm2;oVT)ga#7U{?+#5}O`WZEWh-HoSgj5>R62>a3C7vg z_4qxOy4o_`Q%ID0TJsL~22wv%;`q;94~8Xn9@3Y$l3 z#VnTL!(Vmu>n?v|;(Zmhx@&?vT(}*O9!shzD4t*(nswuv41;~^oW8B#V2#`85ctoH zU+Fy50ME9xB8m;I4%C3$m@m=T|#ghSyU&AYQp7ezQo^z_#nHAjsyt2RZW`ES0dcT zDj>bQcWBW6{-0p&=0}Tf@L#aL|1VhoSHuH2I~rKn{%74i8d`SS?Wn%&`+oxF1_DXA z)UgXUL4c^TO_J>XHy{yPg$ytu#15wIsUozZ8yR39cf4ou$rKi&S+W72gbCwLM{c#t zB8vaT<)%t*C+}w|R&H8$t)WBjf!Vp)IeBbaO+{?ipU1p9{(3bj_KAmul`U*nN+-oO z!%1gS7_&QR+f^yGo5Y@)r{IT9eea&sl4G%E0s^6uM$2Tj=quSFRw+klPy|d3Sm;V# zK20l?0pUgaCKFqqkpY(2;N~c;677$jw|h_TWoXKUVa?U7;tkA_C)y)gt4ZIy)$C$S zo5ZI^?T7E)m&KzaVlcn7Kr51O05-h}O`w;>TkQ&=aD_sfy zH~$A6X0`-GrkYJsgt<*(Yre8;+(tU;m0XD z9-H*J2YTcBJ^l};KZJZ|g^bgdovNoXmzRevt5SO2QO|adp5=DqK`|%XOsK z1~fk1_g#?v9+Y?@m|lf#Yj(S|@jver^d4J{YC2ZRsG)K=y)@ea6h{vSq*D<=8m<(` z)hF--nUx|H1cdvXccRVLib9i5h4JaQ-Nsk7vo}R7T_CiC9iB^EI`40kEQiR+WXrAX z`s`YQF2oA_=Z1E9eMXgtnMe&cCc=tam4+mE)@U33nDBAZR3g$~VawjcOe^vA&zdWS z50vo%e)xhgM+J~#jKFNIG2Qh&pJR?>iD9YcNWWq|BjxBW&(45 zLkco0QsG5@$DS4Z*T?$gssQQG3};I7+X}VmH`s;n*Vn<1-a7S>tb-Q_00wK~I~*25 z<1lJG$MSPA=@^S71@n2 zJB<~kh-Iz`UbGx-(2_;tbSFKr;BM!IKUP73YV0L$gn zw^%uvNx(amIcoO%ePG#<`NPJ34cN;1DVG0{uD4_2A_=HXbZZa-Fx8ohV8kAFFdaui zPBAFOFm)I63jEP;Yb#8X#TZQHUxm^(yl#XP^dYPVawwmR zc&FB2dO_Y+q4uFGxNOyzCkARnjN!CqCgRpfp@+6+cfAx=Wwz4qv!D+(YmW8wlSbhc zfy(_}^+&rrmJb*hQThYF$Z%Lxu)OS&e$^<&PtRj6VedrLE2(p@CiH|F3@=mae#%-fZi3S0ihzeNL1wTnY*`nWXkK0e=i=?EQl^TAKnEUX#EA z31h6cqz5#n)wL%JWh31V03dA#xyac73JF83m39dBLNN2=(xcfZT6~KyL*ufe%9-91 zSn@%ag-?(M8Xu$da96`Er~QFO!viK`O^^+^N_6LPnf-4|4q)EK$L4%Cc(r32z+^B)Ks$ltOhIzyrI|MHEU8>obc8Cgx zt%o5lln-eT2z23fl>xIMUzuoVj$!~d`n-@-M;5Jsc1&D>g_(*@V{|cDR72IkZHh>3 z_gtK?8sZnc5M;C$4*81r=AFsr0tq9C=@#Pf{CRm3ta7Fn@xP`v(SS;d7K*=scK|oJ z`G?4wi=Q&+6TsdHOSD0eiq_i2BlAJ6pQddwiQOw{6e*r<`>guDDbp?K&?wu@jm4kU-uQiRXa$E<;mFO#&Uko#NIz~|VWzL1%qO@2bnW2&K5Osj!bG2Z zWfBqh$I0>KvC%u`2yP$^#Bp5TC(imUE@PLToP3Y*-_bmCria*D{eJ-(=zjzBhXR6H z`qgK5&wu#E4hR5%^xtS}=VE5yOs}VBVQb;6r}rO_>BI;|4Kg5z-F=`6%R#Y9+~E~_ zBD795hNqc83UgNkuqY)~?Dw_Zsw$kZ#l1Z=uUd{dpr}09%l2|Z(`1q+Uo~pOMlgv6 z%>Abb7PV}k{L?-deotZgnXN(+p%*CfX!#5Xu7T0N0n~XGJWgnXg7-!H5Km@~7WDq7 z--7CXy+9c!=PvT2>TZxG?ogPG7RoenZQ&Mwwy;NGGWTex0bal6->ML8deicDEv zQ>VMB--8-U7}SfbiOC}cm&Kcx4Ss-Zx;n+>cPubPtEkM-A;TJy zCazmBO&d6i#~Q*RvQ@#hdGV5MG7`fuxo=>*{$ZTO>cnz6@)29 zsy5k+^-=d<{o(On{ek$uvu9)C>}X-+^q-y*HQBf=HiRDf`o92F&v>l5NEBWavFLmV zM~HZuKvTO`!g(ZGX6ghgNmhrpk2@dXghXpQHdssjH6qTZQ($--y;=-V-sx~{_p5CAPOhA=jS<`0M>O`?*xh&!(uWLEeY($_| zWJk@`9JW{oCAV$${+k$EjV*RLSFyA~%?eAj;$5qqXeu%&w{2Xk4(yP681fQZtagHy z#)oDkFSvWDR62+bBX;HoRZVdVNa>Zfodky|W%fOrZJ+tlv2tpvi%2K{?UV)1H?x&d zC(qmh^QQ&N6Lw|d6pHBb4Je0`5?gL#X4VPQCdsAmvNVm)#}W7}*-)2M<2AMVMb__k z)D)LY{;Vo%$&m-0Csb^1Md=m~e~~@=rVpg$w5%9~73;q|{3c_;NqiXW>sg-4>yDKHk|*FlJKfI?SJCR!b=S-3H|qQRPC z28pj{cW01}+a7pjf(~YFjpxE~3x1tAwALt0!+!fy4rn)#;h_0b#_5A&?D9h=O0MO$ z)i!-;Rf;!0h}jcIT@_Gx zAPXU$fKgzj2-F2^ZA95NAt;*mY0pd9T!3_4XGbUKd@1!dDO=)VWjdn3J6C(;U>lDu z)S297|n86o8%<xO|C}Kz<4Afs8F%KR+ zFj8C0f_79qInW_e`8^ddJj^hPuH9&^7iCkMTum`12WAAk-UJ$>j7Bv@&t9O>N@bXU zH0un+0@}}?z&~NAvMzJC)@$dFzUfyWkw{HV_Cd&Q81uk9{XKCX8O%W=fZhPQ4XhF- z@7TunHTM)OkKp?g2sx5iD6Bb9C@BmCmCn4dos6LdjBsU)%*3n?YK3R}f{lujzlzd_ zVO)FCDvCu(H|op}L2q;$mLtBLLyrv!CE0GcVp>@JbIYR0fVuoY0u~y3JbR61Rp;9s zov3;XUYoV@&?Jj1X4xAHaZxzXw?AY9M)bh1{iMRXo_7-Aq6}L3PBOn`uRJNMCC1z^S zECIrVgxM>5;hx!N4ud(TKB4Ws?L`9g*isns%EjdVO-ObZ43{}6?>!uJ>w4W6KFH0S z-Bvc9iv0$g0-o4=Ici+EM0`PlY}>W*!-WdnsG@=+=_sO8B~+2d9)o>P7R;P-p3u*D z0K5tB5CchYDkmE(1*UM$t?B1?#ei=2u>}lf!`MK>xOd)5dOwXG z&E{NGsmE|k$t7;#S-p4V#|+A=j%-L=!hPKToxfP%@+49@4xB-!G7|AGU>5`DHAr%E zw|)9-2uGSiw0!;fuHAlh<5~OWttsijJ4kmz{20gNO3Pa&7CFj`c9#W)lUF%_9EPv!50%l4haR)?|WQ z>dRr|=guxvkBD;d^>lM+2-#CZ8VfFr`Pja4FZD`RypV_N z% zGSIk6MI%K+9FrhjzwE?ytr@o>^QE(T?bA2Q1>0w-ek@Maripat2hKm-+gq{GR?2tR z#YD+$CHcE06d#Gnt}^oUJ(Ti3u(q-HGLq?zWFC&IVzLCteeP?yXZ6R`O`b{YGAG#Y z?^I}@cf-`GQt&Cu>8FdjZR?=a;wi=_ZJAYEGY*fus&{roAK}m?e=W}~6L(&v(FRJh zA$TX%WwS+NCjP8hMbAf$^A=u?OGwQZnLq-CIb}DX&_R~u5WJhRz<09vgPC0 zWvO@4GgSrM?!e)?fI1*TY^j%;TJ})uczMpJ?TT9ucvd2R&=v&va@d~)brgY9D5BE# z(`A*kE7YZ{b%N4pRE9Nob07E)mEz7C;W)Wp^CHwMJ1-9ZL}e5M{U*)@PNW%e8ZW=J z&YTC`+VbOaP!B1XhFJ`HaruaHz(kT`B z!(y}@!UMZs1$w%F&Cz8p0X-jGfCf%1R9I;IZj&|X_()}ypaSJ=4~GTf8w_vYoBTak zq|Ic(rqp=oB05Y}L#6hr*-{{WLucaj*bW=Yr;Xw4geKK|GR0a(N^F%jgdL!H!af+( z5-0>wOpT>2F~08sTTZwc05s9={Vtt_n&0T5Pzb~#WoOA%Zue!2^cj@)9r)CuDiem| z(pq|{R94Xf5UPF1G!^F^=F~qp)q9#&XGpVxaSx+dy;!uv zMX!H0!PZ=rfnQ83(R;1aScjW*%1ixPY5f*yJCbhF%2O2WSVu`|gNscS{K~EV_DJN3 zz7;ba#u}JpCpLYw9L5slmRTvttPc{{Ffq&N`h+hsZ@Q`e1pU~{}x@KS%MJ$Sb$=ocG&3IaYRS# z1Y`#2CZ=;$9_0FBmbG}%hVp{(H%G<@B*nAM-hzMsJy%v@T^R_1yZZQsdwk|v{zN~) zIxE404{rt85(9*qLc;O9as$=w5>VNsVI$2G-Y@$GFvV+rw}n-@a@3l}i1vz`{&4OK z1TxsD-*9{jj{nwK;6Z@yJ<|x4#{KaGC;EA=*ibjGr~3ZMLO&H`tn2r@;>akFZQ~$7 z3=fCVk}dFxTULwca15&>dz$UFxMovn%*2HBRe)f-tIWNljBhu9N;8ftPXld$hAuF0 zat}W$nj z_;mh|R((G)1MTTxhWvY9w$5nJKmySf6N{;<YXpK$ zwCnU067awM>SbAvr~n z+_c=;2B_l{b!Z-`(T1WtdJf@699M){Pxnz*7J4<(H`f-UlJyw;KZ6ciaNp7%R z>TNC_j)a2x1tr9P;r>^()S!O_gp;(Sn2!3}Y^VML5P^p6AORhuIc6`g7{s>t{bl{< z1#0_rhobZg)ND)6V;N$>a7@)BwLNZ{L~<3cI4!K_x$yixb{6=aX+h*7zP~49tu4c( zox6@@1zXKCv<$%tAcv<?J4H1|TgPA~?E~4n6MNohIz`04|jcE_YdG$;gMD+l!T%A?_0f>QZRpS`q2b%fNy!hBW-k2i1{f)VlZLC zN9hog0v$%J0Q$x<>iYK9W*@4fOg5PBM6+N;)>jJPfbApEIIgnKaRLkiP~2gG)JkW` z{zpwQ{Qw0j`!m9Y z5jm60rRG8aayUB4k6kP=$Sg7e@@2+>wZIatfqgDj$N1G1uu+pmfj_+IBqLsmlZ@-; z0to}wrU_kPL(gW(-)=wh)O$hcG* zRA+2$IXngw{t5wppqP>B1EDE_qBH85F7-_Z&F}^-XvxvrT5~)$KvNbCcBr27+y1}W zk>9{zk~Vy4}V4;}h)&HCNv!+l60GVS)O^&*X231%DpR`iBbA*Php92c3Z%2?<~ zdEn*MYW1XpzgBdCLz>9!eB65nwJ9K1D2oQ-uO-Gn*cZbI3jy0x>8W97x;Qw$+f#@c z2!x6)GU`OvJwu%F{cvv~>$zh!!XI#R>LMg+=#V{gng~|xQfis#Ws)2&Kw7QF68MR& zWth50vv0u{E_&c^4tp=CVrvE;Hl4O(r)6bU@dACps9n6Q)m=(U0NgsNGmp=w7ED>B zdU=g(ZF`ZL6pYyNF1+x4vaWA>y3?j}^80yqgU-ApICsWRG$#3{FX`l>xwlXAh%>Z<=|ft`<>IG%KGXh zGg5rXtQh{MRB)z*k%1D*gT014k>#?K+GmEG4tlcb_`Mdk1|bK^ZN3sLV zKv`;ivi9^=QnWq}P1zZ?60On2p zX9qX6L$=EW+G&iY)Mq8xm&zO~s-{Q4@bTV!WKAjDmEbHYgYjTkdX3;*d2m)>u$>C8 zvt&}}^aJaOnPOfrRU8B1wHD<0%O0|b-+-K)N&T9bIAUPuyaUd%L6UPPI8WV9!b1@i zkPLkrGvu8!#(d3gt*a5dj{eQ~?fbL#0KV+Yo0b&3EeAcnH$WaGkg(aLY0B#*kj4Z| z$+0kptc~lg3b0gG-s;fT8eWyhUyItoYDcWt>4jr#ZtGs&LsVLWvmQL(f+xi99PiQa z{j^!WWYEDlA(A%*Gs-Te+p5<1MjBxtJZO)Gchl%Fy5an`vf1+I(05CU*;~6r*k5-f z_n8X&$!nZ-i@Gs}Q!`TS75c`fYtrdqMT<8s9>Ap=0(lj%{r)6k56GGB!214%1VZ%$ z2cmpLkmfLZsY5kVec||NPC*EXBoisJvgIvfqTHp{(nyt+X{Jh!Xj-I>fby~6p4~Le z^om`eaHunz3Ky#mCR%uAmUg_!yoLJT;#B!s8GtxULPTCfz}~kWu@4>*ao7o6;?jF^ z4-n4k(f~<0o&fGhM~_M8@p*6`&MEQi+qLJ63(Ks9agCuS5L5tV1#)-cCI0ZTXz*C~GwIxBCc zdPe_d)_Zauy}Qo#UYtXNJUJ>K@?!e=NcG(wfm8j6J)yY$m%{iwi4R-g&mY>gbJvlS zrQ(c=B0(Ymt?%*9C({#_hOf%~XT(#Yrf$8twr&*{u4&vcBbSrhrQA2%RT5-ffshIq-2Ny7RKmE7`DL>~e!;YjI@?nv&553p# z92CInb@={H=ae4L`m*tPSc@X-U$b)oQd?&t96E%I?RmJ+fg=G^UA}3yp|Z z8pHthFEC*^2sVc(7bb=sfvS)A!+G36Ai^AezxxFVjA87LIT^r$A6rH;h?TnKbG3sE zYLwXj1p@Ey|n^9wPg!Oc<^?EW3s)izu74k;iAX^{2CNOU4F>pZ$O zpqCKfY7=UhkoPDQRk?eXdtPHihugCCJrdxhdnUD*gf46;G(704hEy^rItTnSlB0aK zg!c&zLF0?PH5nw#Ud^yPqxhuPk5)qKh&#IHxnpxDD6WX))KHNPz5Jr#9O}5+kZg$l zA>7m=7PqL}D{@&btj`I-yfE&HLf!|Kr9%Uouw}jvaCO&>aagJ5uR$>2=?CsC3IPs( z9fkNu(b06&hK{)sf3ZNO)230dPF4(Ds#;RQEXAbVA6q(z6%DP`;}NrnZ3Tx1h#H^1 zaKKL|KTTO)5^t|#%(7obYwE5{d*B2tl5_WIu53#znrN;=Hh-bQVcF05l%f{#phhp| z{alBr5y0P#oLs(~QfNhB?0?*{ZN42C)KtnNZXC^nw?b7%aywPB8aA|u2@d#dzuY|J zNTh9-Db8}AxCExEZagS;0`-&}C4L5da)kj(-b_cQ zn9CSR#V|G?qVN>_BCV~L=BRUUO_j5OnuOfgymFREiutU7uIQqHG~ehRdn)xPB3ce; zd5&`%`6%}4`}+0bTW-h9GD0kx@T8OApf3e5H^IHxrZHmlp1;+f2UvwMB&nbJJ1qG_ zHwV>8>-3MM0|JATs#!E;LiSh?j>aUF5dGO+FxA#5dl2@$SDzszSq%x|6Oeb@>}UslhG=fd+Di4!luB)?tR>SHnE9v-RwOE?Tj9?FX)z zIaO;&pJ6!B<&>QKJ`7?lLg{vU5r-%U$Ul!By_Vzsv+D6So|&!{HNef5aJ#D^3al0j zUH&0CLom^|ilnW+_xrG1Uj&B!S z;fk$1XxW>914nm=rW76oB)VUD11=r;M|QJgefyD$>WL5O;fD(Frl@hDLI+lZ%P%Yo zK|%^`O3Ak73}!N{ppOfYpeU{Z{G>!!gXv~2z?kDSp!I{jvsI=|??2=$uRMoguKu09 zg2SN-CI!_DZyb%{F(XCp8K}F?MHG55BY{3QI5M8u^;+#WS@_bapvB3=z_8+Gm9pRE z)-X|4zp+}0O?Sb5T|3Wz0SmrOXK%>fzcTU%*^*avxE13y!G}81H zgn)I{naxW+lSPj;eX$7-d)_}1SPX63Ri+bZgx{;K)8djVw7zFo;S(&GSdHLDfN_R; z4zI#DJG$dDfY-Y14Pi7I-Nj|%uGWoPt|NaIenoBqwG(-2EG|178ap5($Q;k)QR7)^ z261ZhLJo{j^NU#1o!>Vd%CkT8vBxRf+h8GWw{?3nDVwv&cLncdG$j zQ`ytE@Xn?~>zZ?PujpL!a%bGQQaf%+rz{5+#q|BoJGcHyU)(LXNwpD{o@=5^k2F>@ z3n|i2!Pf(s4Az$OrAb0Hcl=HqEs`3&k{Bw^Wz3FN#~XnN#^6gKs%%bKZS0 zAO}a(+99vGbpM>wW-){OO-RJtEo_L@#7q@K37+{iJsLCxn+Zm%6*l^U1@@EJV>O>aG*+ztzToJtOZ+YjDhTWhO#>@Y6X zyYXtUwqh_AVc{gfSQs1m#Q%tT6aB=c)Vl3pK z?id`!Kq^~puggffVK>|B&QD0 zCZdXsb@fZ{E}9<7AjT}E!$wfKq9eLq1Fa(uYpK4cRTOJgO_R`%d7J}fvj2O}%cIN* zwgNZ4WD(kQMgYTzZmdZduO<+MrjQ%>tUJiimr-QzzCd)%@uJ>&CvfBC6i%ts-;$BXfKeHE)N9k+fDl>@42r$tVliMdv&c{Kp$Vjl;$6&XW#}JV-Mr=&;8x$e~tQ1ybM$W9E)bF46dd*G!LlLz|78W+0XN4qy!hx+t+kiF!f~0e!U1$ zf+yqr2Uv98AXGmXF5_XJ;`ElJAx0eS5{q;L$WFkf@#Qrq0fmcILn1lo6f}=S{Z7K6A1kV5)N3N^on^)!P#nRi9qA za}%;U%dLrNaw;lOK{nf{qr`;kzxOX~{$0=kAOccqw~XbEHc;`~BSsDMG)tFHc>M2- z^nB?PZ=nWyS553XB3Hgxk*<1nRfOKDBY@G3sL?BzB(g*U)qD5FWAU?eNl~hAtudU@ zkErH%tg=RVsy-vF>rv@4Iu6e#%b9j&mpB`mNrq#G;&u98uLsxR=r9FN%j1*3?tf|u z51Qv&=CQSf?R-R%bN{w*(W+^$;em6a$tle`iYMh-oo14`@H1EC@S%9spQ9&Zz2*3I zH)S6)#gRrK!AT`Y)L~Ce8*11aGJ5sYiya%~L=;oJ$0{a<-nmDWSPgsT0#5nMb5WHj zlEv-1H`e++AqR~LGFl|bxE!C)iX&^i2bz;;BBY>n%xs`zfw?k(6A#l{-LnBthR0>}#<5Rq>b~W9)?}Q5U*Rst(Czw{zA6$7&vUH>wPHvxT#lrG=ka`OkTCLyXyO1AsY0 z4u7C*jb{Vld?9XtZs@U>=BCt+4y39d05)X0%VAw(Y-=0&pWwBO@L&TNIKeG4fOH5n zK4x&0QD@;ytACEzX9Ph)6CT1As43kN#Tf8X7z`R3@O~d&r)i#@Od(EV-Nz?#jgVow zG^_5syJtPnh54LcSboqdL@)}kBY>df{pa8mm=wJ+kEr@BZUACo*`|KpV*%yuSUKo? z_d$Gj9}_(xrRPABgSoa9*;W+0gC>4?ChkN)jD6~x<4vhn5KYXVQvy>m8CNsCxSdOb zjv1kbIH)dI7U;~_lWjv*8umNkbV6}wLvra3aK#!SnL1KthtVqRL>Sg2W`}e#n@oS6 zqZ&?gW#~dEdTaLa`Bc71KIuN;vds5qj9Bpdkv~HM-!I^O^xeRQ`f`1;{Jvmge)My2XE8p*8*>JQY=^F=l zCSl2y7(!Fkf=Md{)mX`lb4{v6!7l6+3;5EZmq&33X{Q1UQyB(r(H;As zL1F<#N!!;z=5<@iCULz%E?}YpRGWCF1;NAE8TQYPFGm_qKzgj?z+e?@9jz*InShCp zM^MZ`gaHd`-%Wo9n5&Ah*8spo>ePgkRO*1v=!`g>kY*|%6i}|)uzQ@%4%8`I+cAD1 z%uO4ict#?@(A4fQrB``PRf}fr3IGI!W#;kp-Rzy|B&#I^`^dsf(zyQg=QZ}~8`%HPnYi#lvNd?Gr27Wepo=QS7oOt&k8%+5 zh>g)A3;PH5-9-ayYntI-+dE}xYFsv7yHvFWD^1AzATYL?!ow1Tw`mi(=0THyfCbu9 zfEIkwkmXhtn(93bV~m)81;ST3Sg}Tl_Mr_d$*vUBWW^}`pjCV4bm8JbioHRBLoeI} ziRVl88)QES8Wde*hU1Xfy~Hc5`Z13t@$CCExf5`!%{*%?FncPk@E$$*nN4rF;2OA! zU#UZ!Zep6@?Th~aNNLSxbjwk-vDFFW9-$_BYu(B?B=FTr5Gbn#255i;r=ar!5eJH+ zh{Oui0J%Yvq03`I@HfYGAQ|Wu|L&nlYEd0u>`)x4VB-!lnCjU>7HVW}71b%T#Lplj zG|Lt%@s{a?CslX>LNH+)VonzaK6mDrwWzURe#`pV41%OH=z@g70)FJ zKqm8&7u0q)0MeAe0!$Z`pTt_^^-(t`NI53jvP0j?Q^pBb@CjjU{ZeFQ|82Q7V4l~&&*&12U^!rPIjG|GJIX)M*$t63=mf5 zC6YfmDmtt~X6epF_eeVG*jdiC^!GaPw5)sU#15U1oS7eI@c8(fLkQZd^QTwqL%{U# z;hr7PWmADBc=LLa?v?D`j~7GX>h}J6-TYr~qI33h?rGN&1YxyHl|u*5FfJ<#SKKxj zjn7DN!F_9wuvuDptPt}wnf+xe*{K$ZMF!hLh~HNli6wEa#S;eFiHQsoW2QLn?d>e0 zZ2EXZe-Fi2=)XaNV^I1<1ASo0n=C{Q7r&(OsOQvYKB_w>8q z<`yE+VN2?TAGELuRDjMJKq~ zEeukBNp*~l3M|oW3wwFCl4is&5YJtpD_wxXo1|!nw%=DhR%-UGzT8fzFingqYjAcR ze`a+>ax!5OcC>hrKof#>amQD54bY8JT`7}D3P0}4#+t#HyjU(zlm5X?y5dEl(HPvV zcD_i@kZ%n>EDpfl+Zib}4<@RjntK9L#1pfT+FUG($<%i3-A8N~PNiDSa4J$Qg48ye zoz>{>0%D_*o}pTY+B@%r_=1w`W2R4S1u(8uNdzCB8%L6b#X?x1@?z8|OYRq?Q!p$- z2{LFsiQpojzEva>$`5}=lpfTjmp|jhXGI|H-*QcnOFlnVHs5AVm~b^!JD z%)>PS291Nk?}CwB>obX0<{pum3+m){0uc&xY!|;qnz=s#le+Ol$-KD$sQuWJo%*xd z=G(2`pQ5XBvjDGboR`YX5U_ref>9V2RUZr=8#El^>=#jeK;g!-_@5E*0Jad0~x2naU5A(NA*z{UaFNvHw7h+el)Rbt|{ zB@Af0YuepwQdt2tC@nSFuIyW3U~t(VReQx0|3#n+ zES8yCMyJQm&CohdHftSfL>v^@)HYO;;9A&`?BH@g|A0`w*$G^YAxCSNVmEQ$h9R08 z3wirEbz#&2+9K}rtY=wjS!c$wspaL7+hCe!9+kQAYJ=Y{fNi%8D4IRn3r;auhiArB z2Nf8DL>oOTtW#$A5uBzT!8iq|2R=Q=P90%FP`ZZ7UsdWc1121$XJmcxO%($vBz2$h z%9gVW8c}Anb=@r|CdelG{Lf-QX!WnVLrCuXpsxG-`@5a=QXx%_uyC-r4U0~gN$1Y_ zT4EFi(F_dWq_hR5FPV4x>N;v#tt|T3+Ff=Dd?*447~RQLpbY;!lO;YJAXSQa(4uNO zSTyM;F7j9JC)2boy}7Tz-Lb*UC) zt9-n{bYi#Xn{xN7F`uFJ)`V|E*>1xDBq|mlU>Ww5k0Z*Q9-o(+tLv^NbYzdCo zD2|~;SmYJLYr(<-D{hpss4|r9xXQ7;*^|Rfs7Zw=GLhGLBarMzy}p|g2mwqSLCrEv zQNW`1{a-{TTwp0%H&z=9E3wvmTLT?a)QAwv^in`w+ zK{P`&&ar1hY(&`|6Y6VbgU9`aMGCA!s0k=}&z?yQ6}|gHF`59*%z|D#0ru%^?g}k% zeAoKdDfG(BwvmpNGiyFKAy;-#MGwV|TAMpm62x*Wqoy)2=UPOlF+)EG?07?;xtsU8 zf=@;AWD@)2o~myS&cJnU#@20p1UP4oc(a_@V;YX-IF(2 z4W-6~6Mea?ovjg1-$2ue8u}lt08UI>Xx5Q8prMD|Ao!$?V>WTSctEsy-ADE_Jn~XR zu3ok7CchhVc%P_%T?_%$>F;c$(+%{~1KhS~cEZ^+b=jIryoq-xtCL%SZI#mWm|tP< zAvbXd?*|@)cbJ%S^*o}sn&Zt1KN)Fiisj#$KXNOxnH+8xVwR&h?$F3dgB4ya?SJ`d zJelv-bC%`{y}pu9mu@B$`LpGXyf)xJ^M?R(hx+@&1Ak(f0TUUiPqz>+JFwiX!8pV2 z!*V!$D5HN60%9%$cf|Jq3|`%2lH|kak0B+s+_D4zxl`?eB2XD`MS$(ro*^DtQBMzcQTFLY^tJR1!L+8ZRt<2hq!=k zYT)UbnZj{IFt;HDi=&>jI1cx&4!x8dajS3TmiQYuV!iuQYVY>AazDJ7h%8HO^ju#k zs?OL%P;-4j47dS5K;b-Hg7`%4?N`}z7lUwqNu`uPaD^YE^UDUX8x z>;n1NPI03|+#Wuo{ddR$);eWb|D(ED+3n)$`dR}e!NL<4O; za@E#U`KpX(i*B+u70$j`Cik$LeBAN*{rmZ!ZR!7HVPsi0c2Ry=uJFGm-+w@_|2N6> zn=7|-HnVrK{ZAa`T0`4@lMUtngv&kXoeHetl#R6+cuOKTF~>_Kx~~U*(}+bjMJW}D zD#-S0{=1wdBvZfPv@<&aX^=S4ypOk8@`)b3x=qgXH?8~QNd)?fWNI@CPx=bU5~&t> zGEEeOtdS5hNc0@6vS7RN&CJ?%GA%SJBa$RWVh2i(VWt@)B@;&F;TnvNESYNCB#AY0 zo}pm7Qzs|B-d=3v`pHSzq7oL8lPFQKD%M2EE4wX>5scRx!};9=$t>u!#%YT|8GiBHZ(wSRw}Vis%_A`P$i?*t)T$w~^QD^||z=x8>KXjosbeY(0akZMkVA z82j%6i#+CIOi|`o-t&AnZV)sm61=)PM@lm&(9#+cUhPbVDd4I~C70NTFU>i-mzxE7 zVOqyLV!;+NV^UU>kzkA<$f#WtBi?P;!DpbpoZ}7+t2kQVYJ#!GC(-el*NA{EoHHBs8ctj@rwZR-y69h$b0aP)e5!AS( z8!N|!2~vzr)P{Z~leuC5*R&}g=B%;>{KbM+GxlW;yx~AupE%1iC%(c-eygPlWoOGj zAQ)N1Rns{%ehAtcNmqLYPI~$mPXLNs=6J4X<1bYwHa5lgb8cI9uE0=8*4HFK3o~t{ z4VVD2qZUqCI?obhFOd)TG=j}~G^w$rH!ASxz8{@Ylt-0yYNk5L36X3XOibr0ih_Iv zk;UU4dT-7u-dvnKf5|`IA0HomZLhg6-vzuBgg3}s^JBo;BcIwh+_Dem;SL?w?@~B6 zZJam4fR_=O<-u-;MmlRKrNPC2c&oq^$*T424NmsPs2wU@B6n+H`hztzj^R)}{1B{g zbPmYV|2yqD%jAqu~qJ-gQV`{4|}8QVG|pYu1Er4Zc@P%DtHbVhSoi9AbwS zboz!M{gw~~#D*q20``amKuwcS6Tt1CWV}n(Bn&>LIe-l3tj7u;U%}IA+{V^&2PsTb zYzLrPmEhQvb0>2@1Rf65(ekyS;jo`&+v3(e6c(Bo1 z#c0mDosxr}*wY!LZa0H9%eGC{k;KuM8>qOwW#H0ipIDPUn5j}XvOh~s_S@a%nMvh- zsih1y%uZ|~*~i@CQ;Vs(i{CN6P4NI$d6cw|MX!_dVLJOc2gX+VO)oD%STuc}rKibn z(Cq&4FuF;wvM&{c+eT}U6-n&aque`S6zMa#da7;Eop2@XI9-t{5+}#>sskIiKdY~4 z3``vyLJJI4VN`xZ!K(v&|2Td9SL$?rv28UFJ7c&0e5`}(4!1G+E_eS5WOuK$j(4ja zwQZP*8g$wxfAz$Sw22>KHS^p?Rf^Pm4b*-`a6s#m#~ngw(S~7B1+@})<5q1eskR@p zpJwl&vb&oPK?5HTkKr`>fC?*a`=>JPWGW?WYzUAOD8PSlEoVb%87jD(g%32|iWeQE zedGC-ADp4>S@JgaE0M!kD}Ts3G3eb}16CX9z{@+M_zD>l2p*4wCi0tsYNXFPmsiWe zmzJ}@g@XdLkDp@P@>vXJ@=*>a9}n&j>la(5H`a~*Nzi3&@z_)~ip26ln z)^n~8`%eEqUF1J`*x$wh0Q8^7?}hby|KBcRV(4P%Z2JGo#RC6V7V|%8ovm_u(8csOR%>GF@!N?~HEitO_13y3GG|GV*J`R#9_NjZO^;2RJxl6<>+W2bC3`?-{{AWzNlYj=D5-i~>JD;u>fl-6R42h2!tbY7p32s;dU2|Vr|oc`ULbf6V_8+ zW0Peh_d`2PNX>sucr05OaMx>}=QJ1HA%jfzJJtq1C`c5N4o{OGMH!t@&V=H$Eb ztmisZd2F8j*&wyl2s6Fj+wEoT|0Jw{_v;Fq{`KBS*f zHX?8-Rh+{2m#CYa!{D)k7B6Xj6hcorLk(e5x^Gx)Q)_A&A(1s;|492fPVya+kutYp z5Rcsr!ENOF1=uFWk{;B4)LS=4#ou3kzNA5Hp#VGD3>KdPHPIi6R#= zznV?hkquu3Iz39YIoIK+R(Jwn#~$|$5cm}b{%iq4d;-JEuK{z$YMcK9iA45G@yV%s zv?-sx7ee^%)_Zapa_*FWC7UFaT`^0BG$t{jxhXurKBjPhk(7dfi8L#S@aE_v3eX~e z=&Z(rYQk2OHWwi@zD6)On36(y0hs_PMc3m4go88%cL5^-VA=fPWI>eSBF6*R0kGu# zKt(k*Sgk|uFhNffX{K2+5H46A+)1A)b`$@nySzpEnO##Wy(0<+O-SBCELLG07Xsth zvlOJ)YK>_eu#$I}Hj)}naUb?DbAg*rN;s6Dz!&MA$_enO;Z_`muqcoZ zS=L7m8s8DeuqY_PAV6=2kxPN$yzd8_Le?<&acVYdpliVDA8mLh;iLTbcpt5**vgCM zr1?LJDBs!2Lz0MiMbtGxU7j5FUyvVDkcR;d>uU^$^X}2C_INCah~N-P1}PGp=|FcC zDay*jgQg=|j_obd6#pfA{&sdB>E(})TQ0d|iK*S;Y_ua}XY?NaG7{$70c7{#36dfL z4Gx?)B8FKv^I%8ukpFp#xMkZ(zl;Xlfb@H;+TdC(-rW(B6e&*mk_(&jYEH~mC`j^8 zMu^5P2}v4#AiR*PlZ%{#OgXtP4Qv#>L{NIm#&l!HT9BCaxgtImbNq>LC+^YT^zo(z zg9uWwS)iDgf!pZ1a*Fh!zGCC^Bq5_&5}c$08-PPc^vD53k$ra?1Wf&ZjdEn&iuzbD z5lBeL14mtPaCM3z8lGFmz@06Ba5O`0u;BYm67JJt_(m~YJpRGZOk%=YQw}3dS%w$JE*;Iy8EG7wwjR|-&|n_7kQZ6kfK%xS{&hGmeXFl z_TMoJc!4JBBE6{v_5hvoS*9UI&HZr5TQHV19u4|=+;+nxdZ4<<6>w*;+!VD&7w$(| znPn>%TEcAd8N=<8*97zMce)}DWrA`jwSgdbhF*>Vk_KyE7HHI4TO4y$GDP^ScXW#d z_4ABb9&ncLdFnzJJ!k8T3Wu8QgbTwbzHZ&XLCe0PC>X7Q^vNcf&~b$a>FI>pzXrxl z+rc#6yrG?)J49Z!>U(OYlk}0r2w3jvBe1tX1^_tg5W1m~*BC07BrTT~vo6h^=(_@D z?le%F(WY;Y&*+C-|pg2+mh zIl2d^@rBmT%08fM)@+|~t>=3U6L4zthkMxHUr%FGON(D)A&i*jZ;BkJiciUYZ#kc- z4*e{k+8TcY7Uo;k;?`Q{S^78KQbt>8`}+QJwP`4HZkBq4jo5r-q^OI03M=I$`G0qc z^-LvQf^KF14Suw!rcn%zDVP(2Ce~L*74JOWS<`JoyA*o5WzuPS#EKE6DiRf_eAqww zlxmryJ=#0{oI(rvXCkNC^fo8;(_AJxXxOfJ@1J{|`hk?J{?DkYj#Zn(PRjJJW-#BI z-}zi1y+Rqe(IDbYK=hS(g16o-#y^@ypx)Cqvf2?ojH;a7V1=bKW7hcM+I9W^k(vKl zN_q@}pxpfZCWLoy4<+9;xi^NSK@M86OZPRQRl6ddV7Q4Wq8dZO@b_I`F90Yq8yT%(_A_Zk+& z5=3v2p@TY8vb5?srcN{q7#Wj+=dAQB%!LP6rkt8~t>BbcDMaNof`#@2O6VvIR$?Jg zljy5gkCV|j=|mv3H$(20j=$axA5EY_1A0)>qQEF>nOK#20;eQ6vlwC0bwAfkwH6hy z5`KE;`d_KC3UEcr=%DDqPH%TlSJ$tPmyhG2@~8CZkEiXC3wVtLzZj#jR20WRGMF&y z>sSM@HYEd=Tn!4Z#X-Z3OZGoBCCYh2U0~zx)%h~y0{Y>I{LG{fgjXTY(BEmW5F`gN zBut{S(PD;E5;M~)Qk$-6?&R;td(dUEhS+1cnpTl8p*B!vvrJ|gV$N)5!%x;AN`R8u z&EMOoP{7<}9??(xAp!7?^BtI)Nae{e&JvoMwcuOdxFj0d=E(D6GX?xQ$_(C-`>2zn zBu90Gcyp)BYlRyUv5+<;G?H#3QnHzdPd+VytKQcq;hl{HjV;}~>|=CYQ}u0K9En&*+B^GMuGkE>{qa+k=9cjhe~pty_NmD%fSjBQF+Mq&{hprEP!xJYU^5MWgY|Fxp7xj|+__?$Pr_@~xVU z$jrTX>#E76p^C@xF6w6km1s6R$P)uJ_x-%MAF3&1FB> z^5GAyF|T-w;LHz~T)MyPo431=I6)e60j53ho=9D+eDCpqo7kNUp{qT1@Y`e zceLyqXg2G;2jlU(_Pk-M|8P8Ahz2^1)@*vVI4Bl`P0OyW?iBrq2SbH=%)eU#KAK#4 z?XFN}mp}#gs;^(+wN$e``4_?#i=r)=ILe3^x6{e%&|mq;H9UjsbYJA?^#1)DX2D{8 z$838vkHOq*N?>t^rGkpivTqB_mEVOYa_j(fK)1mOA#F`oSPn`~_+)(8v(U3| zK^p_$7Z!9|k+@p^bns{ZiCi;iXQzE}01L}W4Ha*aq=RYqp^Gt00$b#N^5gc7snlh0 z$N<12MK`KT%9Hhm8uY3tjVLl^a*@kLBuoC^&Eh{NND}@PonR0E0C8vl0PO!=6r2p5 zT}+)Uy-fd86Vznwe*wq;sRFj$js3M&ZXl)u-gs zSN-*YiXH?Zw4|U*=Y6b8`2uPbxH=lLod4k>S}ssj(lYcY%V6_CS#@kiy0OYJJq4pq`)Bs2#D_qbQsA;{*OA94>ls zl5bXC!>Px(Y~D@HEnPz+rY~;}!>Ki$0xwmH1U=a(`uEi(Sk`gvqD`TAny+cwU1h{= z_b|W~*wja65cQ%rentGIzf-dsy)%p&#@Fll9q`BHzJtCYwZ8K^Xr0(#RjA}$@OIjO zZy16RQ69}98di`nk(4!De3I;m&>|u}SWXb`5-{vvXF(H_hMKRQ*U|BEKQx}G?ob%K z=KkMW3}|ZR_C3@~E9(fjOfVLPKM2NJKU23OR-QIL-?|38r~n7CDNl%#+j+#ovSK|u zl>34BxvAo`G^`X>EG7z2j}BL>s2mR0Q8*@bbvD+k^2%$~NU-`_1d%YwWpR(EEqQZW z^W=0z_22aWGK>+y^afriU3V?Nw=Uo&B&61g`vZ|vEU#q$vaWr3>)^hP2Cz@W{k zk-6`Er7edGKq7_|Qq{Xn8Qbik7#X`>m`NvrWc3a_T2h%-H+Cj$*6#ly-8_P5lRsD1 zQ$BZ+ap^jAoY6MV3puicFupuGV!2Gehm;sUEUt%D&P+vdc>k+SxPe68F_C+T4BBMM zI(JUc9mqcyJQIMpLqV|E>&J5M4~_bwLA<`?Vu(~mlyeo^y@fN(Q0^9)TwXP6Yqlnj zC$B~+k}W1WSD~D5B11`nc9u`!#3MIX52Z8G+qENr)AN4VJ+0Cey@xsr?E!0_Od<$H zf|c-AN2`uTw~dbnY5*pL{=K-TC5`)ALJrKASYlwJO8ayb8Yr|@?ur62CbC+?U&x9N$AvGjqvHp97U*M4>$)zzXekUbaFxzdQg1ZyAbzjSnPY%@p8 z#L{%HC2+$D^IK6YoHFeG)bE4aA9qe@WMjjVWH~UM^eLcnnteu8Mq6{G;O|O|3x{0VTaM=)(9yPdy2tAP{FCkO0X zuP(8GKp&R<>;x8^PX+lx}<_84jrttYi|(R|@b zpNVBr=P@Kww<}3mLcF3sWtK5|YUr^#5O&zvTfI(^st`<|VsGuoE(KIatVqj-P2A{x z>!z#O;hKJmK9uzsW|tuja;h$~D8#*}_7A&iDaooTLrzuQIhWnER&-}z^4^4g@SoTW z)AmJ~-%^Udhz^hjTW>Fv*6JHe4209E)NRGVi1xyuO3s=gfIM3c_cnvC;b_@93R{Q4 zt_Ci=$SMA=zf{8Iz2XGb0SzqqrNQ<+Vxa!=V%VR^MJH2mfgGHLHnw-SV2}Qn0K-5jtoJP=ZcSoou_YWaU^}GLxO}(F7J&TnBT%8M#O_IH(J$i-@gPCPj zr`hoztyHbN{e(19PXv;@+8TY{&Ruvp)KzjqKKOAg0iZNt%Q}BO2%AiY8B*B+31@!+ z>IM}=?6OPG^WWqe^rH2>p-m~85H=4Ez;P%@9nd}TBzqYu4(=dHCPmVZDA@^gg+57m zv5y6-{OW}xBc9AYDPVVeoN7sWIG7chh3IBWDFy+i7jK4xx|ruIb>A=1f7=Hx5{q|i zp+DUlpmB6uYj!XM)JOx05Y~ls0Zpcgb=B(rkx3}=_=owmVES>z+2USMA&8Ml{S&0t z!|Z9s2L`g`?j4Z;>#0{S{~A2%FjyzJq7}%}JAID5#^^-@d&5aMRd}x&DrY25Fhr^D z661?ZFB2ZjXWIq?z+@VNinzjC$oNL?>WN-rToEv+OyQDX2&b-%6V?P6$u27bR&CY@g!ua><+Jzm1ihK z-jU*}orLYV(ZOgqlgu^(JmGgh620dA8Wut*YZr(_YIHZ?s~KqVgAc7Cc0eONy-!xe zYdLi?fq-#gN(>w%Dt!Uk*|@8K+dI5Cotb7B%IQEJ6mFhnCakHe*9}Lz;Yww!6*u9H#QE` zfu5kuv{rL1+7(!XY)9``S8TA4L)RXgO<(J!P={wn|70zR5I6}L!#)0CA;m76Wq2x# zeUyN=+*td~OH;(tEF>?&g-{va7{>@r5Cl#>!6ZY+-_daAWg39OQ%67AMvtzYB60koD5*pex1$_OgU9BufD#A>Z??e_whr@kcD*(;s|Aq)j~X>#Kx7yp zY(VJv!X0e&^(@xQ8(IBn52OHi9wB^v!NLn|BNa^aHNK8Q}t< z!NGpShp?s8Fc##R)y?c*-w90T$KXavG~^xVA$41bM2iyk!mUvwno?N{p3YujS*SEa z=tbmZa}67yN|%}SZpu4EnAUMv^cO@6mE0JVN7gYl+ly1@eZ$$XR2wRSxc%=P5e0Iu zadd)Mb);dvTCSGuSw6kN7i%k3o3O2MRcoQaNP1xzW_HX8-tWq0G+J?YF z7hDwfdj>y*Cp>8nEa1&ZK?N|BeRgJIPr8ql2cco@r`8van*s|9W^KqxOD~DfRG1mhQcP~Xa9LB~9kCRq?8x0%RruL@( z*2q<9CMM6OuS9`qCOmtatwAQ|5xLfyn?{Rn7c~v(#yZ~gnt#aOfPNXP zLZ4yTx8{va4r_@UH6TMX@Qsp3DZQSp&?dyD`LPOi7+3wo1@idkh94P z2JwZ9kP}~q?Yv0efCV?h37N8xm)U4iRphrK+!l#p+ibL_*0N7CS!f7$I-xbHRw; zy`@;r5f$FobWyqv9(x9M6wD}JV0col|GNL^KI664Sqbt2>h*Erk^s!H@5k*e_k94& zBY{>}OFwPNhPVD@mx9=oj@5^ShHUmKa07#_UjSFbh(XgXP6>UV0WY#inAF=nSi>+a zAZ*1a9S!X9yLdh0{~uH9KN`@Q4dHbIQSb*ZMT~3k!$kZbL(@eJLt0BXGYIeJ0vu9Z8Y>zjZ}|$ z?h{Mo^7%TqQ*o{{mD>)jmaRDht5sWforl_PRc+m-UwKXWdb-uCH5Plk)Ydp2??+G7 zr|R}Q`i%XP$I>fovU9DbQ|!J=eD!4kZC;tZ$2co+31<%uHXC`z*Ebn$xCPW3CjcQT zsMECBoyJH9oHQGcZU7v76^XQ`;-7p3AM>nFcgG1Y-kg=34{YE;e}@}S!*OpRt9_QK zpx(ocHE&UlqMeq;u3Xm#`c})0M~t7-u_~h1H;I)p)q%sxu=()t<2Hp3xR;O9px>ef z_D2A|v;2YXZ@l$Z+8G~geYpUtJ;yGJMV7bKcCeCyPWpIE(G%y1M0CJ@ehW zbR|pU=S_~Jy4L2D>M?EAE9$u!MwryjYK-zeREvhW4t|gWj_Z}L= z9T;cdZhW@q3#`*8XgiKhPhvp)lp!%Cf5Tv5P~#EEo| z(Bi}fJxz_X{s}ISSoE5BND954L&FhJAz23$&JdgVXp{W@*nXL@(h#t_U8W*>Vvm6~ zR&GiId9P!IEmhr{g%1l0h98%s#6jgTW{*_8Hu4+75C;k_F$i2In+tKK3WUVtT_Np$ z)g@QUdhpm|K^u5n&I*N}1hC~&?iBSg-@Mtk5%*bu`4(F^Opd9%^1eP7G59xMu)=ED zpFNm=vZ+`Xk@4ZECcMohDA`a*>#fZb)mAprN8D&cU`21k8wqNR!sXsJ*{o};ot9Jy z+f&^_@bV~y3jt6oo87jL;M%E#7Y_)I;xwUuF15uICrnFiBK_4*@>W0Xw4VO)AzQic zn6r%hMe30;S@^^0#+D?U@TcXBFv^B7b-c<<{La%N!Jq&{|7_z9gbEz+I~<$5SgkF* zKGF5l82y01tj`$SAvt`>5Reml(uJ=$7Zbn*!&AK7;5eRq|1SjfacGCcODFMefu;?r z5?G$;>TNP11S|Oldx!bi2_UXnv`8AQw38G*@$-;C5iMkY>XCSTHSIv8ASE8gk`M*K zgRowGFi&|hR!m|hQ9Vj%Wh)R;k@y0|@`@+ELhZ3IC4pE-hN8xNqfydsdOb^InU#pi zH3MW?Gr?{`cAtkN=cCFru0$%MdUBNzITmo~N=^b|J)#e+p9aSPcmj`cg&@WugSb6m z-z~)cUF)#SvE7{ZtVi!U_MV(i;>Ru`ZM%UtL9J%*Xzm;-?Vg2P09evG5{zrIazJdP z!?g2i!n|T+<0ugD&`)Ov{_ta*+geJ3KYal0h(x?G8}vV@$#ATPS*>|$3u7YL<_O1j z5O7FI-=)+bn-Y{12an=sqki_Qm@TtP;zh9G8;h^`lfc>Bi@0KLLBr?F^CqBM`B$$3=>W%nIj+(%{ zT}4@X-m%<%&|-|10(wh?Rg+Q5_5D203I;c>h8S!2%HWdVWi4qsa4IxK?>QSdgN<83 zX!+|kAH)r#Bn}>&LhC-UjVv3SrU|JMizw4H%!bORR82l?(c|}r4XFvFVW{^Tz{$Jo zPv{%c&>fGP)!u0*aj*s?h{4pbSXT@-&4f#{;^3jDxl3{?K5N2Rz?5a3Nr>%yh7dXg z$YZd8e33=h*I-CC%DYrI;}fc0CMT2~DTu)4q17~Blof$TH6bttdVX?VylK)>(3#4p zEC%qQ69F^sS9!tr(8w1e$^gdgG0^(t2*YT)UPDyyqEiCqj8O+!sw7a#V1nER1L-)( zxft^ht*{?`o;y5@Y$DkwsVGNd(udJSntUQSNQiW_`;uTfzxDSghEaw-(9@hSApuE& zE0?$g$O&vG?tP&AIGb@HRwBTISXkf^k=LEYQ#6+x(lYqtAQo*#Z-k0v1!~SHhVd3c z5OB4R+Djnq?Ff?a!V%UrJJMmIn;f%h!i_846HRqS^D0xHP&Uig=QplxsPa}bFAzhO z&nZN9W31$3 zf*(+<^C;!D;0Pugs)(lfc-|#9WzD|Z=PpUW6?`Uo08PauRyXzH#uiFkK2e~fHI>!i z7X=>LU?5dAsbmz=ron|oF?2nq>3k)bNJdLIaI(3uS5}{Z5#u&dAhL$OGuQS{EAKn( zE31vIeQs=be7Aql32d>wr*a|T!nnP4em7LAhrJ4WD`3-B(%#&@<;=Fjc#vgu3HIAA z54l4cO``rEy3VOR6kyA?v2EM7ZQHhO+qP}&c*nMr9ox38n{!|KqWcfjV|}Y;jX72m za+O>McZB4wxIGX%BrbVD#$e-4bfw6(>x+^C9;Y0H2=)B_e1wkf4vwM0F$m|C>K{fXG1JH% zgrXB)>W7~OTMcR8RUZ^eT$Os@^{e<|yxiX+XcF7%zB@ZY0iQ)BA#D#-C=2v6Z5sqy z8~K@S0!XXEj^tbd=UN|ssD^!t=D2sVa8L{I@8I8(Ihb0s$ zIKSjLQ|=94(XHb`EMRmwP|y}*x$L5(8ufIme&U+jyZc2xCXo=UQDKdk9r-N)UtW^% z+9*={;>BmyPpln;2Nm<`ALgf34|H3UvZySM&6v3LGV?@F)+~kN7dR=LDfWbF9O(lhP_Plr{ZRb7WDJ)Ixm{yh+S^;Xl!BsfuGvY^LH?j41nLI8-|Dlk1wfNXX zBtly7?Gq2ItFl^gE{4Jn*zwtoecV~XNx0PVXrAlR$tY!2OmGASe1NHm*^`tIEX7(8 zMz;CDwP3wcVsRdxjLFsbiPH0K&JVOU>hRbM(Fp9?C_Fqx+4freu|-G$wW0gM3jV zC!9vYIe!N8s+cAn3DgU@A?(xkCR^?6cDlNI-ajVQ`M;00mUim$zuulyQ+yuEtJ&%8 z=Fv@Onh#uz6OgOEy{9}^R@T+}(yI~`Z3K_7tZMdlI_>_C-)tt00xt~%K zPq$K>*^IwiQlJ|La56r!% zJb475Hf3OM7BS~?PBr#fG7r@v8qo+mGA%F@xu=q&dd^JGTo50Wknn34i1a2@kN8Aw z7IiJpc_WaJL!kKxr%ksPgaCOmjEG@GjP@cJ3qqD}%=&54kN8D(UQMAVyyyz@_%*CF zPGyl*_bwr6jU=rd&oOivaKwWPi0>J42G5Wf_$A!}=Vv!)cexzEk+IgJ^F%i;B_0a3 zih&+pW_0{=t;d3&U9|I_(bn&KH&e_m)<&Lq`20|m>(H7VhPTg6=0O)Pk7$^hA5fUg zj{UtT->{Mnp>-^y7mI2=Y#%#`WK2+dC?xqc zF;$8CD$0oXuN%YTh<<+(?P^*B2MHyC3F_ca~ zVgS4dTJy0`xP0kB6JY?3iaa^CU&{sKgU6ti@ycg7f2U`rTVF^g6 zjM?N(V-DmQgLJ@im*iJ)Q#pFVyN&}O0m@NRqLf9PSfGsm@T78h1F`K7$g%a*tbhe9 z1%eEMzJOIkEYRAz{coSixqa0)aZ9!751`I`5CtyHLI-Q{DtU<2tp0ozjS5R&p1&T- zcxyC@-bA zJ{VpEv}nh*GyDNEF6H7)u1JTD&Q(p>O+1`?0$X<&;ix(arA?wF!J}3k!lv&|mKjua ze(byh#pkMJbO-`4MTgJk8#svr|LG+f8O=Y`UKmYIfk(;W@I9`?yNY9w&Kw(K-9Sdj zdV&OSvZegG2)ymYo{-4}11za@<}y2*@KSe78o|u3<>^4y94ZX48PB|j;jLvbLw82R zBhh`%)tP{*4^QdjwP$6 z72_$Fx0PvHlBh$J+8%mfbN5M2>x-ENHy8Dvq3!ElO^5Rw>rCFP#4D*o_dqM}YwbH4 zwMp(*y0PC6!-F*%Nrb~^HIfiQ$6KKWpg;DS2L*%ohCik@E)zHE*$`I4CSkQ>%DR$< z-(8Ha7eb!NrAyINVSm(&$5tQ1h3R-8x1%>Z^cPVw$g1hgtiuz_5G z+G}*fK_Y0|#A~~VY9zJ3f{On2T4ed&afWKO)^Ya6YpX<_weA7Bm&PwZ7R}l8rnZIG z0)Kr}QO)y{f;}ekRBT5m4b;GF0l@#Qbd#=Uly>3!Ow^R}VNAz-@DKq#%@@-nJq@&F zLT=CzziebY%POp_P$AHP46&284Wk%U(MMzI4(lD?0fXif3x${Q>Myvrkw-TX zZtDPC_CihhVxM57Cs*Ph>8LIIp($(>?n$jUEH5Ch4b$>$`8s1Dd#=|EibjVrM^9u=U-y>AeLVh z*OTcKbWe#CW7?|i!}HNoZEKZwl|Uxj`K0X7+@h)5?ozn1QxlYtNi(apa4!X4JkbA! zaHr#gIl(cu3knRY>r8F`q%zQtPUU&azE9`ICL6i}8i5YgqPyk>WC@!!%AKTHb9C9s)FWuJ;ap0+H(pC_AN#lG@I6P^H@isu3gXYfR)i*mROCpLY6VXthQHiGI2x_b6lT?dI^<-ZOsitV78;sPp8YyPfuug_=K_ig zm4v%nuUvPWX=&+U2yH>21}n3U8ZhppRveUi=p!tG@DC*WS-MuubiZ63v~fTkkhZ~usDRaU@Qt`F zpR&09Anup+xpPO=33cZK*NX8)s&;F_UYIZQCXhn*Vczf2(Kmj&kP@e0lfE`(C6reJ zd!#Zjp}q(I|4Q8cgOSm%dC9>4t_ES{{%`TZ|NZOw%Z&U2Bg5GKbsKEgKR{n_9bLd? zy(ev+TU>txo3`w%!+~twEp{A$lNO(xQ-;(>n zl87pmFk!`B?|c-l?xdcXX*1H_tx4+*U*c=()I>Xx13mM|3!$A*qt&fgrj4MY?|pf{ z-00dW_lr1EAkQQj4P*M4Sq?b&t6wJ6d5l-ln$zd&FHPI%3UEt1U$*(*3_sxK^>{s` z0IJnuX=dKFlx$9do5eV;x&pigsrJE|QAbm@0i9f-q_ht;C< zy3X*~v~T^|wO6{UM^;7YXiZn;^y{Q+y;^zv<^_Y7IByO?qXA<@K($1Y(n4SdfzT_V zj%#!5dpQ!TXn5^InX)cVSZJs+_-K@9LwKrR>EcRh=W8qtp@dRn@fS|_p@c*IgI|^@ zUw$xIYNJzJTOgL^*c0e_zX{B-v+{57g-3Tl;jJj1E)wn1LKEy9t!6<l#>^;fwN=>dX|L#AwFpkwcREoQe&9qCVqqe00NC@)JZ+U>qoQZWA$ zRmaS?eUD)jeaE$h6X1S0P~qHpePRKI{K}^t8ZV+w=1gUGC@c zy|tjuPlrdB_AOy443tH(ioJ>2=O?Nt(n&HARk94$E-Q(sz?fiK7u9uVRH2MWfeaJH z)8FLU_w0yiKjE=Mp>n|NSYSsC$YWCo==?DUje7LE?N~U1#n}0r`VGzmG?qv5fJUmt zk+Q2szQ;DYrv|s{??0a@KXRB3YI>*NgBAL}l@YTxBQs`&~ywSv}Sf#(n0T4&9)e`(G}rFv!qr0oA{ z1SE%DU1uB6_C;Qc`gQ$$dbqoEeOyJ3qdzDEOezCxQv&>;0N{$r0h;>wB7xBm1$}be zE3S%$^)vw?JB}%HHTD@6YA9aP4+XyLy?wp1_~YI1ac}ebe?6{2rp@$A<%Vg|*zG&@ z$|71d^r34#wp2YEz3&3rq;rJiCU>3S8`=vPM6=xse(p}z;rYKkmEd253wLp`K7oeHN1 zlDT)#xIs4fk`G8_!`|z~1Cc+*?#G7(5|@tqr(e`j$=fmTuv~{htga8CaLLV8=EDlG za}ykLdqPmrB&aYy1+QyweCOE59*j{E!+&S-4}fWH5cIY%5T8un@+y3A$A^gLEnx33 zy;I&mmOP#I<*DU+ZZS?Nt{EeXVgBnv^%BwATz#)@1VSqxiJhd? zMJK}in=TM}ssh@od8*{mgoM*@=DxLIA)i)_T>7Cm{p<&U#Z-oQ93>*P&9bGImNAEU zr^f(USBV{+t>Gy(z6mD}To=Dt7KUzl&k!wL&z>aRt;)c%#L_`wh%OwFf)(N2Pk))+ zRat{@I5rt9I-Rh67{W1(0O)6j^2{2M?2}H9ja)(HrPhbm=#5Z~JaK42Om9WbE+ok! zZZkmto{tbmLR>}M`utJpz5P5>IO`_h7zt`J(g@8MjM!RnsyD3Uo5*4B5U78a{L5j~D`8PV#1B zT&TD{CLQtfom8qa2IuwRedn&jO;okjyi^f|2l<&gA$OXH>e8{SDyqPqii)ukhAZwg z)KM=Lw~Rih_2lbRVKh}0RI)SnYkb?cWeDO+Ag~}Cea1T_!-5DV6(9c9MZ!WQDlJuO zERo!6HbPQPD&u0s(;-AbC<@X!kit{NoWN?Aa{(injGa~aX?Ik-5eZ*oD|dvz*?{C> z;v+#FsTe+9CIxMzKqS`1}FgOLpgl%g?@r0}Mg;UY1yN&SnbOF>}4=zK)r1#Pmh zEz|1E8%P~PZ?52y5*kS64{mO zc_RxV=oU)#E0ukRvhc|Gd>ZYXzNTH`*AA;6If>cVaN6RS(`YRAt|<429oRSj!MBOkmYv0wLVA8@15jH8Ap)8o98 zNC=?vT<4^(JDJmmcvZOKs?r|CiGSM8r+a9&txkW0F;!f?V7d>J#k11kR%C8}RRRfBoTIBL; zB*Ma*#yK?E*Lx>y({$0YKNvaMAIqeOI* zuQSz}SK(fTxcKgv^UfU(fag~nfePxnLpM2Wpz5oayCQu(&Z&G^39lA7HY&VRr>&9b zm(nOK)Bg%I%n`#zb1i61Zb%SsG+=t>F$q`h8ax*uf~HAk_87+DJd$b9E)&QNP>l$V zq^O?+g=;f|1F`&Qv_tmWE*Y|bVxkvb<#eJek;#N|b<%D`7{P!FVi1>iM6+ zA3DT@XSzh$RXylZ)N~ni3C^G6EFu4`=`!3Cd7p`niDEsRZy;&2eAxAAmIDbyUd>b$ z`J$J!yy@WrZ_w2t6Io~CS*Wm;qzQ|-YnjSHS~?u5>%N5E)*I=qD9Jz{l6dU?Qy{>J z$n409VK76Ws{4&>d6VUrhl@jU`Ob8n=zSOZINuJk=3l@s9b@^BT7(m|Ha$Em$Uo+Q zTp9KVYP|nUj?Faw1Ij(JrY2d4fO@P7EX9yh)EM_HB^eQ=l2x>M35qJlneg^2-&Ww2 zXo+REgfDENKWj_0MmYM2tJ;sXDJ^A)uPii4ON}j7J*dbXhfQl-vM<$U z7*;u^HpXHLXVmFpb#K!K&hrHVRq`B^mR$oa6e?e2@c^c8RRwdC-CnBwj z-$S%!ho6}Bpd~*qVdp~}ONiLsh}s#BRe60tPTRLds?J)63qO=TeF5rOx?YEY>gT@} zEVnjqSAJj+!!NPMt5R<0{$5;?a(plv@8yKmF&#tz9YNC>z>6wGq>gK1dZ@ZiuDdSp z>8!0Q{sw5n(2OqU2f!uY!05#8-twPxr>b#Xjb9O z=u0N5ch_ZQPnAr|QU|zv5KuYIr~oZcIltYkl?6I=rM&Vmv4YPq^uO=$5G2Uk80R`V&m; zrAypr=jMpLiw83}De-qOn#PK%(O9WOOI^c7oVnCUX*W zP~zLBL(Xa}wS*R`43hJ}M11?}4nrpd204#V@T6}Ry?nwCSWj_e2I7dV4Q%=3v=e3F)2y5 z>ETdT7~Y!Ka|2DC-5IUrtp!B269=ZoK$K=w4jny|8$oQ5&NaMkD)}rm0#_Xr6)CT| z14R-md=U+nI}fS)%FOK25on+|)6xXgC_cv50JdKJp=amBSWU%hoX*}nR9fcH)A!eJJTAv8d67Z{-w>DX3gFYVO{=`qKlUB zWJ^-DGm(W+`c7tK&{Y;D9$mEDiRphWF{WI|dPJdj?xn8WDj4+K%XQ|g(w!`@NfWKW zvNJUsv(P}P)eiA;Aj_^zqG80sD|}J)`R17H0aY`eA7C%jT|PwF%e};5l}jq6;o`LW z5xy4@{^5pz;T=9J;$feFfk7p4uy+u-!%yX^3^a=4DiDli!rMHKWd!0~r*Rz~*Aqiu zf&f_1k7=6K1(76H*TZljA!Gr!1WrzHwsU4!6aA&}hPs8C2il*}b^eQfD$xi1w=3Lo zSMH*N=K%VAZEG~=Q-zV|RTK0cpk=BXd3EM;$b!{51e2KnG7Avm^#mAL4vBLh=J*`v z2U#8daOG;r=$nwa+^J}G7uOy#=F;!V{sn)Xe-oOGtn}$c#vD%rlWWIqrPB&@ZV1b2 zQDga(0!JuWWn5qv+0>+Sj0J(?6{3x!IW}+>0CUS$kYbQpCpK2lT8OkJjt0^MFpY+m zy1wdcIF_w{2o7$bIZU-*K0~deavx2egg!5#;hj4=ovjWX9iQStl>Vq6n~&< z6SRp|FJ^4`43Br>O)bnEy&8bMiAVr@vq~t=vL4zlk)wjB^th!i~iv1%-ZM zyV22@y1tD_=m^5P{NXkPD}mulMr;`$+mxoL8sAf-8mE6c#DD{R0g2eav2pA5C4+&m zuk3wLpW(ZW(%nyf!6oqKEI4<(7jEmIb)6zZ8vLZgeIhq-JvIlpz#^>55ZSObO0X!j06_vy0O$bHYi{iRK9 zNdO_ZvC8a7x5&j+FzgNYR{7`-zK5lT&on$!C(_)^GKRd`b6lzwzEgAz%XQr^e8 z6qE;xgyu85!{IQ5q#MIjy@}^W-}Ud8laliublCL`cuL9)gC#@6<+Vr&RyJxd@sd~MpdX67 zpDFf@M^w6CiTz7^#jeG4s}an-Q~jBdm)+6Vs*$oCe3iQ zii+EUaHS2us6E<~d5ZqjoL;Byipkwt_;jor{UB_MBBzS>L!pKj7|x0-n5nw6Ls-X+ za7N1@GEv}0dFNW*Q4R=Kj9ji>1$vZf`;^L!J>AJfneI7^r-6Ll#iYfE2?~ih`2bco z+DC}>X)(5Kp=96`blT-PrDqcxKQ4f(zdWTkVfw=q@k116^c z5$f2_8bhcvXtJmO%tJ+*yoIk&f=5`Rg(5Bh9Wn}TFQ`_5a zEM%D`(8dy-?uuL*AC$|iLKxB;xgpfVme`AaN?_HdH;f(@Ko+CIPFHxP+6y3Eeo?ps{fY!LzQ zvbEd|bcXzQ28jm?0V;}oYjyPz9kJJ84W?$TnxG;V|N!qsivGr%tOQGtKv?G6!epPrjd%jzhQ zY5qZ1twA-DRJo09$B4FBM^7Rx_;c|AJ=pR`fudr9}6>G*F2(7dY>x0 zv_r2b#mxBdI+4d0f*Rzrmc=uyEx;*FSLgB4H5-%Cx-NDI_KwDW zbPBNT0qIX5>OE&YBRD~haa8BC@8f9u#t<}tir=X&h=0Hsi&Kh;c4Fx$lH61-EBQ$& z>Rh;4@Q_-U?7Da)0!KhzPXlse4_r!P$ynD1sN1id@oIa$*{PPT=V&~Sv>egYS8XH0 z;?dsIy5Qp|<^H(dcat5XG=M?z#jRxqZCixJVjaNVnPSJWBfdG^u`&&)xvY3?8%M z1ysgD5cR5T~pKtULRQ7r2znY7kMXu2=94-I&D0oYh zZAokiPc@ zn6Fp_W@4S9vzpQ^Y;VUZQm=4E&K)|*kIia~$`ejzuEpYOHlUxZGsDDSwQECXz;7mx ziAV?0L|pFeC8tR%FrIO9PJe|;~9fWAgJZpvJiGE<3M!$KDj_eEjzozd0 zSG7g>|Ks8ghEC3=PW1Y}%Bw-ZRX)rq9>c#_Py z-!HuINko<5xqpGDSfj*1qO?dXDGxd*ZbTabm)){DQjmV@Yp1fBmj>H3 z!G)HI>O@;t(MSPDWLAw_k_YQB>pK^EK7Q{a-6hR5zuz`g3rbR;$s(>O=8CH0XbpB* zK*LPZWkgkfG|xYLv-B_efAMV~KKq9cN`Bok)f&7Vzyz<)9o~67)AOfuKl&fk)a7ii z&+sYJ%#m9*#=pnYaLcHwL_5UXD%NT``oOrWLGgVd?RMF^lkZ9E1CY=D$)WQd9+F`` zWje^RjX)|D+YD_`2Ja8jooAm+1W9>(Z=IQ6XUOlwGtBMDp14#l(GQbHhbgV0ApPt3 z{RpL->5V`bCCn3{CNTSSHLXP3WtvJ8IHYt$kzT2#>`E%GYeTNeQb=}2db;Un9e|$d z3w!4HsS#8vDzb9Js1Y?*s>svpduA;N>{f*Z27rGZDRx+dgw>d>4L`>!k<0@YS*-RIKv)+akrjUwkz$1tG8BP)DMC(MmmO`Iihyo2G}f zyeJL5oeFH98zdea1f6H6JpNrP)qu621)67dlx8BBM?8CHl85+B0F>{4IuYslg8Tn@ z!$DyN6}>^W4>q^@IRRNH%otfkv6<)5WQ-vL1~persqR=azfU1?c5Egos-?+Fkk7Te zZn5aEpTvsHcL5e-oUZ1YFvi>m@Gpx5H&l!r+>&GE{8psCrb#6&)=%^sx4++~TX{Dj z(U`=HyA1CFdWH)a24%oMLVg6dus(WbEE>i!j|nFkiRcI>m+GI+@sA?KTy{Xt5%q+l z3uVM$MRq0RTd=7{Wi|no_xzh&y_9aK{AjFVkKbs|e+G(7@a+z>NZPh4j~l@YrbZI4AAqEZRgKKA!wiNrJVxhuwa6DosrkcD#g%peKSW0ayxQ`vl_|Ld-u{A zCv|H)Dxm)_t~)*ZKmo(BTJ7{IjP0zcIvJ1{TF&$3)pkAZjY$OeAE{ud@!3bCJ4542 zVt+M|p}ZS*s^y#t5tEiJ^fYuSOzp%e{Uv!Ptq-r^Fc!0=ylx z$Y}}M!hnlEWA=iRyi!q5;t^4fmWdKmZ0^R;M&Ffj^*XYqEmdto4POchg&EUbwwb&T zVSgzZO!Qau02Vr8qrkV|nih7XnjO7k%nqcnuVlI#ta=R|gNlz3w8>CV_L3tb7yC-% z0iI>$LF4`C?c%RT%zN?Kq}&IKcdFfF;uY$?DVlt}ax6JU^1iy_@sPE277;5PEClg^ zpZGx~7Jv=??E3K495b&k!h>t^!C#eC#8yoI`YS9tpw=+I^|W1tXfaLx6*?E$Z)3ZMRop! zWmi2WtX!E`DQC6dMs z#S-H%6rNl)e~#>fZ|K&QoXsNtky4s=RGW_(8F=>0ZzNlrMr0RH# zepIB8YJ3zl!px6%Q0`ZysxGb=#1Ly?P7HL(Q=7(}3(*qYl-%F4RBHiu%vcZ-!94UM zM{6+O=brYrTNX*iRO=F=P?CqU)3_{pvf%^J6(%=QHB19uROXEK#-9f`RB?u#ZRI)z~m=YtS zVjQS3tfV_pzz9>a?emTW8MFqBWf8iU{Wb-Fh^p_|x&yi!gUh12=$(OynY>KN*9t^Ppg&c`wmWK z#RV-MZ7Rep`fL08b%{R54>oi zkFimqR6NC6^Et1(q8umrV-}h*4RN?2x^Ay~=cCevWe*aDwJ-nX<-)H$yvxg9I8Q#qK5>FAwb8&2mtP5P9Z= z2Z7|rTGhR}OIv^?XUx9A0&urvvG#Q|-@1AM$Rvj^5p=}P$H+EyP@~vg=&~`AZQC%E zV%SVl!^lr<6~#hW2nYy9XNQo*@4phxOVm#i}9I>kS3LHW+#)%(dEU)NbL025GWYBzf{LTIWqiunBl<)cZ z?0QFvIEmTb9)m+{>2jL`6NMww0|&!Fa3COfgj1WwyEzEdk6-n-30J zs`GV{Q0o4d*u9&;i^agHIO<*~yoVv`DPhZTc!?!c_@10)shZCr1`Zn=RFAT*E38;H z;y=A)k(~|2?AT6hZ(&XYVqv4V;HpuLXaZ;sf9|?AC$~L>m0Pz?itezUS!jJviD$42 ztl0m%eHRf(+m&T6ceQU#cRGY~wiBX)u-+|lY3^~sx}GF^Y}SOlK6z+cGC9V_m5BRj z#Y&R2RL(ykD@AdjWf_dZ^YCUf_8en&CR`J)GB=gN(_=ycFTIX(5dP$qD!x8 z88%vJ^e7wov!M%da?|ZH-27<8cix;J+|=C9nB|cPQJOT3NJ)FCd9;xZNl9(si<1;Z zK)=5st5A{3bd1fW6tr2!#tEF|D%%dw@f9)QngWE@U_99_7)3Wis-9q%!xi-spSuy5T>CqJ^u_LXKp)rpA&d$knNol8tx+l_T>)R$W0sC~+i z<}EvSC3}bi*o)a1{*w{z7G&=su~3-u<3SbKQM!4tSE2bmKsVm{o#R*&;ok z(w9>I`C%Nx{-iBDSu;}iSVBQL5o+CL4TipPX|2_&bzGTIthE}mOL|8bC()f!mmnrV zixcBtbEP|^TmM%9D~fs#!Ht#51UM9ZR*x8k{QUB|ymEbzG+E&+yt5GgEo|W>nD~n4 zy$s7H-Zufku{hsUQb1mgm=xbv;DlQ7L@&iJf(d&scD$m_nWLOhJ=q)h59_>=FRv$* zOV6(1n;L!s)PgL9)A>VEAQlnPiCjr|6HpIrzr(z)2?&M)HAul75Uf1~iq2@h(oVAz zuqNXO2Jq%0D|!@VL^p3~74+6r9E8@Faa*LacQJ>eDZ0 zKW`%zF(q=8Fc7Z%Q~6gxfI-o5wVazImmal3LNA(Tp?fcyIFUeC;IX)WZormZ*jIAR z(a*f$2AUtxSkI%M5u=x_F1-^`pISfz==BP6dERJIz1E4Qt%}QIoVn5P_<`+?WyNe} zg9MRf`S1n%}eZB(kA0BKOez#q+i;n`G^%-5Zd( zwVT*mAm!$fS&E{G91ZbAAeoN!CenRwv;+;NnU+8AF0eP*mA>N(KiKf{K$Ld~YE43v zFv@U7_4CiLJ=y+k-eceIx z9d#~SEFSd5PZ)V-iM~4A9N{D_n-T{Z{^o$X;4BL4gF=zwE~^@4p2i9FChL)xc4+H7 z75k8Afy(2))%g<*0Gb?k3aA*o9cTTbfznHcUn-SP@cP@z@ALXhzfeI4Lc*y%`Ixdl zcp7gsrUlTBZx*^G8c z}AH63q>aC7u2XWT9q>ey|`j>3jOVMl)vNihQt1ATTXhW*YAG@ziYYx zm&p2IV&7n>->SE&*3+FY#e>>Z0g-MS8V)nFpf4v4{j*r>61PNqFZKtX5h3vcAAMij zd=Ow}!VHsi$jY_}5&A4B6>b$^&zQ_h1M=2*T){8iHGn5_LZoRf5yg&1#pF!7?K{R| z86%M|qF8XYkE9yKAbG!a<&PHXl`J=kwDZGrwVTH-dJ_C@XXu2%yx)2vb+Dj=KI=t2 z%m93^{Ltj>;1)>$1?B7Q@u2=5AXseRCgRu)F#(lZ=m>v*YhL_!%lmG6-Z$`nC2{}3 z$e#>c=VyLxq&E-%00jSI689f_7n7~YZ;lqKs`zUph4F(=_z^fA-!9z-yWCSC0Jaki zM76ky(ydRYLr}?<3t3N+OUgdniBGv2lI)xj)s`;dQ^Mz;7m9eh9O(#N$+ivoGqnd; z95gC6y#CHcvpbUlN}(KBWxZvA%82r?=F_r%4$$w`BSK*tDljRQZULnBFB9rlu{|)j zWwjZ$KF?<{81{DgeSQ3V@uE$jg2UFYja@j8H)-(&Xd~NM&M-%;WYaHsv8;(N_!g_o zhoEz13oSQnTbyrL&o}Q>KNP%R#*B6C8Ug)!T+|}Q8a(OzYWw#muqwLaI_svznRp<+ z3is;MZ^{sRqio5{fw^U)LPO=3CU1~fdkcCU^4%Eo;~XeS#j$}-Fb$rN7)kD|AuG$J zx)a`1drP0Vo{AU~RaR7Mxzu7wgl5fG)evqUz87U$Ns^p+m2Z<1;bD}$0KJTOy&$Ii zqNCZXX3ZE}y6Lu$@bl%(2js1U`*a#y1$4=^YMZOGc&{oW2=M00U>Tgz|0II}_=NHF zAxsPz1Leo1k<;_Fq88fG{22)4i7E-`rA3 zxgso+UzG=ytB7w3nT))TFC{GHmd=zaIH6`qe54Uq=$$^=pq!AUT+B-*ohJVUrK+j~ z3zyl}u|I+qO%ifsT(p(INi$5ozGZZfwm-Wx=-WYlz5dT~g zT^o(R=jhWh3$QUuygfE0Y(n1hRQy~sLA30_>%`WJiWV~wzlqJXM zse-PjJ2&m_vsrapw*hvBakpGW&fz9tO3lCeHUyp0fV z8zXm}W{>!_4$8rBp^*zL_=`-qqw$gEmmJ2FxpJ1N&LD9lMhFIQ;NwvjKiga00N}fD z1SX>k6U5N*8{OT$H`|GV2swE?ba1UaTh}0v2biAYAf)`JDmG9(^e2a&c;X?)kcHW@ zS*0rVOkBL2yzQH%q?FN7qcKCG0}z{LmQ(jhq$Lx`Bay3&_xEq5o-%BpJmi*f^m6ig zJpP}LqtWFc^-Nz0w?zmflAO_^{q~X#f#uUILCj#z;1Vt!{3tpIGv|}{Er0QTv>KH9 zf??^OMKWU9&L9Kxz5AxyR;4WxASG8j*6m0s&znX51*_S^|D{<{Ay9m_%dfHH z;KjkvPo+uLA%ia93xb3r2v-mUbw(2DMX{{CxieqWnIKdo+pf{*IdW0{6R?Gn*%1}3 z^N+=xC&PUt+BI?WIZukllQ1JBgtSBamdLmomI53OR{$vJxeM%yiesZwA5Y+h?OMH8 ztrl}d6S3E|H%Y-V4-k1q;MK(;V8}SWO(~tATa&oaB^&q(FVYfB6*C*T83Ss9uyn4T z&Yb-=1GGu(YQ9kov@d!c^}P4Q7C9ra>1M2hf~h7xCo<|}s9b(7FUrH*VGX%T)o0NJ zSI*-^%QjPE7?XmZdav0Uss789@-;P&Vk_VXs2H%^ic298B^IB#koNS{4Q_ zBv8@85(E(_0v3n&76*yZ4M=%IN#nEb>6V%p7sFTWu4#l-d7W;Ke`1=^n?w)$=1Os3 zG=JcDfIR2rzG{@u>=P_8}l!GZ^wv*bP^3+j42H3VHBvB9!?V!Vd8H>Oe`FZeAtN#W^w|-l;OVn8fy&R zPA~NzFI%e(wu9$uTg;=x0K9yw zyG55}U@g}Assv_z~{n!;5J9Fh; zxz+(p;72CTcWRar0S&bV{(65jf`=9QQ$i=Anl6#*yjc0wpSpF!ATe~iq1;KZ=;SsT zgll&|Ok@PT$BxcOPNF_n_7QwK|LNFY>+p#p4ciHV4l#&!BJ*d^JTf{7FEbO4?q!1w z)Fs9mRMVqT4Q84l65uGYSsW+z?yKU3=4s6mW<5qcWI)y>kVz9XSQ=m|g1?I4LgdKY zX;J2ItRqcwIUK>W?s;wA&`2z2i+zLSv(B8lfeUqLtSA^uI;p6KCcGH`DwHQH7(LWV z6Sdn}51qpU`uy>Tl#T^TcnPd%ho(AouW^~9=OUqJxeITAEv!&^L`iFx9heXBlqXwG znFtQ`-#Q17w3hyW08Fqbz#l|NlPguMl;YX`9vzikh^3qI%32ciUF7BDpTyg=`5>Ev zo5I>L)RB441+sK{mV`dalp}9!Qp5_f&`1W}V3e8biggoDHSiwUdIwxCSU~S;_?k@p zU|n-ui`bt;}!|?dchZG^WEr6s%!RCp& z{|#s(130|}N0x0Gc&X=l#elByxbG?LrA}t7)Ra*M>}3-fT9+SM7aV$zVW|W+W3ay> zMdhZRVlytayG@(ZlGy(2w`l3kRSac@kZu`+#PxWZ)*~hx2N#V_NmyX+ZGr@ zdWcRTVq_48y9l~D$r=s)WJ& z#1<_@2BedQ_8vT>kMb#=e=O9;0%V3*mbDiU*lG*!&K(fm^CnnK6v)x;>V}unS2?a6 z)7Sz*zag{rQn(TJn`>^$A2h2AxTxv=j+$h`Sktw%jmroab{vmBOB1QzlG@ZbUYj|} zWrhNxHMf;TqOftYg7?52r#)fnE?}sCDOoZ!rJMGx(KZ3&DT8I!j*1{vnP7`@O10L` z3RprwrbQ&;9;gSl%$DJ=&(&>xh(B8lBAT3DarDyAWMNwQ-!7A=Kq9$?gm~ZCN&|UL zdV2hn?|x&J+UECkBisB6*-KgtPwcz2=(w! zZpLIf4u7mFwW`)ic4@zH%b#cqJD+^JF zeGmbDW)=d$eAcn}LqciC($4j8VGr$)?*YZn0L90p>_nsIH`2G5Hz&s_vFW3dgVE&t zb~_HtQAy9LX(~LJ>B9DuvWHW)BF{W~d5AVqHOFOX!Krp*cHtl=$HgJtf`NH`sE1{) zWs-t6?_v{ss%qZ&6(GfXb0m5YFo*F)y~dlBxV3WoVfy+7=JNy8v$G7X#Esr$4)YiZ zaPeWH*I;Tbc+qtZe<+W?=IGF%KZ%%~n75;A&hCMl(pFdQUt}KeR%(&}lAy|Euys|Z zx9399VHhh%m}$W(-&&o=ca4KCEF{K6hUmDtqLV> ztP(MD946f}NF}Olw~K7~ZdN*9A4bb$8r&*$zZqZ4nr>JgrQJ{omK^rjKtDDxY_JI9 z0*scN!jq+Zkqn%XQrtvQOU2^gSN3N8)yp#B?R=@72B2(oq%IoR0Ayi=A&VqgEb@*+ zUjhyslzR@WBLPT&TXGRYl#WDgU$JS~xx4ZHyfMB7F1nQ!IBNo#C%grrUGXuRu9~^* zOj!%?TV9c&GK}$R{F==Cl=HRvU4f~6K$xG={C$Ct!=s^INqE*lnpEk2mvU5e zwq^w#(65L~p+1Zi$aDyRKoO%6(c{vBeuf{+9|HQM5X3Iepw@5v_7@FU8+fL|{LQV# z)vjl}La}68_JQEF0OmT8{&E57d1i4R!dLl<`OlTa4fz$&5dxJx&dGSp4O7P4LGvx* z)R14Z;|0v#qabHgldgA?bcW)RG#(fc zZDfZhh)r)3^KWulHnX!kwMIX*kQHX2Fv+8KQNyvhHD?!20UKeP)1LhSc_@7*p@kUO zAjOzZ(rXh%@vHKRzqN+x5;9$h;oYBqaAJiQv`cx0{JdnCO@Iy!{}_Q?7OZrfhx;(K zi}F_d81KT$%GE3BU1!8ZxrOeiv7L)myyF}hbDa|>8)rQ6DT4nz>g%R}S!;@+2&D)5 zqwm}Z#Xo)?x7VBE?FEaj#1jahC$UklS)PEhYPgi0yK0Wlr`)-tjdJX)0a-94J)95p zMFe5Uu+@dMfdi~wtS-TxsnMXs@};~rUQsX);U|*@0esdT$z2H4InroR=_5u|j|;5N zpvLR=dzT-!jxc6Zx$bvYGRWs-{wn8-o5WR}O3?#%{d}*3%&iVMv6a&t7{9YamYUmSxF0T_*J?ve zJ~$tx9RKIpVLdj*6uySYvTC5iM_X`(f~**cH-~+}aG;@gA>NP%zrZ5lT_e%gfYge? zmr3OlW(yG_ml5~Tg`vNynM*va;(R2YlHjm(ygyGuiJ>6+S>nU7O4H%igUOX92~Pdu zDrrDuWK64A+k}@a;>^lwznU%k*1y!ggrGnE&~OU#$)|O{^gt~Ty!J7^!kLmE-&!tv z$o65M%y3NAdvrGQ$*i^zzB0|dL=*`hvfAk90c_}s}8qUS=<>&C~iOmFDT z;&b{N&C$n)ql2#ZNyysx>nv|9MO}*q=gYk!BAu}O-x?NY@oTt_3EfZESgmf4`}3^J zNK2FM65|;gm#7M4FvnK%nLgvgA}UM}+@{@Q;H?4*IeESO;!l>k_x_YqpzGr04?ZR> z+k)h4c-R9njIx(899>TWHAUY}72h+#S%(1x=;D|3aeT>Eml96EhBQY|OQo?V%fow7vf+ zm;>#-osIbd{=dOKAgUs*=P%gL7+Wf-&BW!b9?2)GqfUnRoE;$#} zjM}Usa70o&t2WeK=;kOi0!)h`+yv~KzRSyTMO4&ekDtr}*Y4wgx2ca^Z7wGCiuFo) z6#Nn$j-4M}?#vI%_E}d|;Ai(V&?k{utxB;x`EBw!;y!%R4QWrKf=pM){E%V%nZXLD zCrM8&EzK=(=!C>HwDpvv3oLmcI)!#p$YQ{XB!E*A-Fb&&efbt-4Jhl9X?i2i%Ljqr zAM(1_$bM4B#AEX;4^+FB^q*Y4jFGF zh*$9=gcF{JYfSJ9gRvfmwd!Ji3NaIS9_u_lXQojmUj*4`jIs`_j{h84AhL3X=R`WoITOuAzKrTe^G~LTK-TEL z%E&;^Tf^E=7LX!r%D5x$QW9aO=7i*1fWC`o7yWa$^DWrF|jk$t@g$=w;j5BEE9s za{vSN^4WRR?=_pXR$*_W!`@tk{4Ll^u}KRH3Af=uhdpWE6JpUi4bhDR`$JahiWQ>A z4aiIX(S!7*iz!>Ezf?p;3R2007srh# ztL62`1BmhMZWg-n7cd(zf`b<@cg!n{aBTBeYl()>6!^M9xR?KVG<#-jfh8YUvi6{J zw*iyV6cJ0oi7o_0i~;qVp7?aP5AYu$a3^~qIf@q*LRhEZm@5ZyWiH^f-I<4xE&LzM zu2*O?k=Mq(>@E8>QDQM_;#k4&8p}_h8mIma^=x@aLRyE}1v@RmzcBRiy|vvvTJt1! z{$}Ip&e;k9aN;s3p06_7pAa_qho|Zb%aWXlZ#*GFR+yr!pQ0#!G(@fNP^88>?9XPRX@}eLISP?d6YdtqR zVS__?^BpsKBjWw>TUISn%ZL!y1$a8-l^8m?!H9(J5!bc`Gd9jDIF_0yj_x(&&*>68 zOj9=r_evV2l!O!>i*_8;uVcV2{LdVKLIMF_%&$mW0p!OyAYzz$jbpfhKJnEl+}y4j z_E(b?7@i-_3#q|&8ANJa80wG1AA$7cv zeuZ(&bps1&O;QTTUp9^)d1#r(0pLx}z@!;OplWm#w8N$m$!J~#k?2&=93U`b(7LU= z*F~&0lE63{H#G^9V|JbC&!9w%ms=C(=@%$&Eua^29D>h)?gjriatdyQ>0jpz74-8% zCjz{HpGv(Hx@QEx5k3FWhP=#(iYn^|An&V1$HQeb4eBu{J|9QGNv9hZur-Mm;^79$ zYiIy28f;g&X z8)I8qOmyEMM!H7h*M_^`o|biD5W%-Q3q1QN0IBC2G08ba&bi(PVmkP$TM1kJ*ovvKw^YJGFus_Wu4M26=7hT%;#^~I&0Y@ zMUZ)@+9uJO-ss7pjUi6FL*1GZwK=s5&*Wh_?gQv88QWu6ZmCOU_=Tuv!_yx{^>D`l zk5$KzeU{YB8l@ZG5wgFGe?PsIr}jmNslZtYMzJ$FA-+51+9s-AX9&Fur|PELM6-@X zDPh`|3(_5R;4aZ;)7rK{%{AW9v;3fV=2mTh3wqW-u-HUY?4D3%hW$;#kR_4_aBfV= zuuCR@^esP+X4?ZErnDr>7nMs4DzR$Xk<6rJkHTX?%RE*Knt^F0WfEhWvZpvkPVl$i z?Zo9JB3}FJyD!f2`@)M&*w{)fjRIAitJTZ{quGx9OOE??tWllBmsq9L<9^GMoK&d@rX!= z@4&EGd5AsY$Z9oYFGs^#L}1rs1Dp0neGY}0);CsB>|>1m&h;Ah=8IO$vpx^8^X?LD ziwTOlcW-;PqTqG`YEnLG-A)`le>oy`65QfVI#m2)KTmzA(5^tq@AJG}*(X7g z+>+M&2N3|CDQ$<{x?*=4%U`o?B<|pGZf&IRCK6m-N0|&uXfW*0(rZ&d#ZC~0%>E9$ zaSBqC@yv8RmtX^+7?L`P89io3AuMyDfWL7{Aiu`kgKsfOM?EJttZM8qZld8mYIguo zUNbrsHrmIchj$OSKU>7-{{RW^5-`xw{;cHHo3Xenc2~dyq~n^Rj7ZZy;wxAxR^7*2 zkHtlkZ%cFqChm2T0W1%YofnA1>5O)mz$g_H6JvobC|!p1r*NAaMtD*uFo*a?jQ)FT zupDn*BQ0p!RoP655OI`-Z_u&mS2n($!V$3Ibu3$IK}h$*9wYjQHD)7clMYjd1Z$(X_7JIAE#l+ zgd=uF_+^B$yK$PV;C4lEwRaeSV;?JtkOgI=fsQ&f*n=7(P!RF?gPJoIa1x(sZ*VWE zMgQ~qX^UhO2wX9=T4BDDdJ1SZ1NBHqPhzNYoh;7mw$f2 zepw6JYjZ+Bub;5k`(a6ycI7Fr7v9uZU>3gtl0o+$;jf%?vjpPzZ(Jqy{wsA)2jp{^lYVL)*kJ`=zC-)Y!@rT!p4 z!0Fd(MSre{Cv!n+I}a+q9_xZxOc!a#8>R{NGPoaLj6aNXvQ`b$E*BKQe(#@1?X+g_ zu2lMeXL{GQ3J3pL5A100fc9d+T~eu$9PIau-6$Ji*FtrMTTU#PPW&*bqD-Pxh{AV-G240}He zCgiea=$76W*sscjV;4b3;#lG!nb62i|DvmQcC_5`n5hCWI?Ou@-&S%_veNDCcliHC z?*AE{qZO$&tNlHmNBlLHCivfv>7AX-tsMX7?3{N@-#8E>Owf&YguXo0d@2GwG*!TM zabp?wsx`?}M0eNI3A(Kg(8Cl-diBMon2r3=g`WB#>9Y~w2%|b$_Bob!I9@(UP?r0l zo^qGti9dDuBo?FsQvO&xak3PoqhwL7f9=f-eiTh~;KYVqhff1|*VE-2;11ANqf< z_CEwA)POSDF3?}AWJCY}ivQDU|BI8LYiMiZV(j2(Zu_5hMU|Sh?FJjdHy**y9+gvR zBbV5L?L!Kh@CqQ{y1}ZVp5GB>qsW96i6UtkM|J8?H~0B(54rt~0lbcsZzdZLFV~ld>%4X zq!{Ox4IMHy>-wt{IN*ek9Q>ZXU(Z`3J==gDM^gl5#WY1i4}(-) z;7xsVBO~floP68_a|IO1&3SWkViiE+yi&(6I7`jwa>}wFB%oW{r7uBUj%nW^M=(Fu zmZz`y@zy5n9Ghv%ibb0*JFtI@CUyEzX{KiY`1KJ+JgPJtp!+}qNI~Wf5y$Z3hamcK zkw)OXWjcrtOQ0M zvalE4XCC-6jMJB3!&ZcMQhX&lG`_@Iyh9GRqQE7#WzEBkXcKTZtY{WAuebtO1787o#O=EVrIU=7+Ivuq zKI_=I%E!r+nr+7RmwgcdnI_%W4iXgQSVKF+(M-lB(zip5s!pq-_}GIJv@;Ir=e%D zbtKIyat!lg&{pa*GE{92oyiGr-{)d#UIC~Nss{TtNSo*tiu#eOx}ZdfQcAC$r!nT? z4;}B>M_|xaBCOA!G)LGdYNg!WOfIqaW)ovpaIe^XLzeH^D?ltQ$tkm=`SMfQ7COhR z+tuR`fh(RT5zDzHEA`$vl8qEsO6Ky!3*-3l9t%#3Dw@d{JA7`Ey5Ax&OTxwVGA)ls0qPfN}QI<3pn)pvFP zq{taq?9Ux<42@|>ff&f1cNU!XCXo}yi+?jSJ~@ucr~f6ZP^Tv3V`l0l=9OrAT2?R) z!8eQvcOYw%^!9KwnNJ$<6f37GP1=~J;@nbab$cPtO#%D7@mU~;(d%=pTSc7Rp_H&p zAhnd5GG-vbCqXde(1n6!I+@Rl_0QZqD-h$oTVy^`x1muD8DfzE=IrF~sHctxdC1*n zOJ=k2H?emsuRdrWkbFnpD3%5CN2gamghU0uLLRCP5^qHI=(=bJ9963SgSkFo83WJ} z>d~XW{+F(*f~g}W4YJXp!2*c<2I=GQ@&UB2WX8UJO=ruubh%WK+x}V5(|&B2!G;-` z12(dK+j@K~bibtO=;B!pVXAA-ib^B*fTrUJY&9+1led4+ z#a6~{gkgYsLDFb?Q(ecPN#)N<+B+m9pBw>EqW6_BwUh7=YOWq7SXGLyvq>k>1<0Sx zLeBA`eQi}2nf1rHk{FmEIwnW28fk4d%vEbnXYd!GW7%;lmE?c5$JwOL*ATV$>{CdV zLZ=xHi)Wt;Re5JGftrRbMrV-uTUea0<6n=K>3KsfaJM=0Msk+XMZ*c^mM;A^caKP+ z{!07-G{f<`Zw~}mAzf%qX;Z1Jy$|vKKzGdor@947-$VBL2eQihNVhfFci;AMA)Fu|a`goELF$FY0;|YMmnjzttwoW9ktyn}k2=Z1T)gvQ%iX$a z1EZx*CiwNI&zs32r(h}h6OMwK#@j!2UmZ-C7zB*&qR-c%qodXB;|Xbmij9zz1g2Qc zz^KUUKM8Ra%Lo-e{am?_o&ahG*-25%bh}JJ=Q6=Q72r;F)}Y;aygq3<71|%7RHD%b zTFw!_5133Cian0Yml27URqyyrLcNgAr^`45#cMn#O_f+KFsGnKU!%h|i^zuVq{_Lw zZQy-@O|}ZBVpfORMT@Jjhq!Y3EBw|Qwq=)8?BuIG(%>h1;gw<_CP?L&vdebzTR{e| zv!f(}niT?tb=UPozQLTHVc3D)Sd8**`}%%e()La;(c|Rza3smk>f&a|&+b@2E8X9) z@)-Z@=`jobG?wD!)AxS7?%77>=E~pt;(mQP7#$vj`qGT6M|RivmI??ED~<740h>5) zijo|IL0^GKZl^>OfsTIQo+ypTf*%5ALJ($y-*Axl_{)xly0>v9hT8<2Bhp?N)5R^tXG-PVkGWm7>&*HrVQRMacbHH5<4)`(S71WWg9HK&x=XS6@k2QfPKQ z0;h^Zw!#@L6rAl3zjUMP)S4H*0#%=a$u@d8{#8JIDa~4vtqJ##49V6e!Q(Wab(O0` zpF1-th05Wykm;KRw|VYHcOQ50~_9GEmEH-cVrIQ z{uYR5W4eP+>d8QTJohru-{ZQ;5b8RMq8&v&cwDbyorZ|`UJkyjbg|gt`r*uC+vAl; zws98a-HTV>ieWR~1-(e5vm_?YEL)0}242Hk<*A>+^;s-*j_Jp@g43(R6dkud8)y)2 zXu8t)SoHs>&kpH>OVyEAg=BbQyGsnF&%+a->LlUu8*EVk5DQ`)=Z zjoJMo^!Hr(k}ypi`O$mMMJ&}b!1_F!H{s&vzc--&RNNg>vs4nl)iweU008d)sRsS8 zy8Cz6QTa0`HOPSAHKP{wRGi$fdqEg63JIPQ=4z`-x?)wcw_aq`obuthuq?cpIwt30 zLN1vg!ux(b`1}WYo@pRN@7-pwSg7)}f04Dli1EoCFEAM^ zKl3!dgc21QY=<4pGvFa{eYHHIGcx~~1b8a&>J78%xbQA;;MHGqvB-T_IL$Sd6?_1L z;jaO=>;7C_Y{vkNJMr1tmm2LIOS;tO(c0l>4y1d6$f$6SJg4N1Ej?5!DxLQajh22y z#l-#>u{7E)$dK&d8p(wC<-E+-|HuJuMrdlke;wfW@Bi0@@%yx~bvD&^qW|B(&i@n& z`6XVfK|!14%>BZLvfraltpD{pLo;JTOFLWh|56BP`o95#P4CZMo=;j7r^7reuE`{5 zJ&QzOvE*F)YG?t1S$I9?M1hpzb7#Siw`-9O#fW4JytMx!-~Dxtr)LagfF4I{(veR1 z(YQcb)GjSM%=Xcwk)rh@hD`(skm_%mkU0#%tS;Qs<4CIs`cIm$lQ0?w>^Uad3qGb) z79B@TLk(?Ve-ML77vT&GfC~xS$egI?fLHTVb*BX(E%sNY$iXOkCSXo-v5DHp@dgLs zQgQ$`(Ty;GG*FT<-tA@)LZiWlv+&>7rK*m$0j>Wq>n_TCj*M&SOGsCtLyI1AeQjEZ zqtZ|~?9ujWjo|O@3C0AWHk9JcmPzTaEEi20ogy)W)Rkyc)p87>N)%F+HX(~R$uh4d zFWW?IGolo`d;We}vgYY&CHwntPQ%HgfSqOr_?{Vur-C?F=s+AhrSd4Hf+m&wB1YDS z&(r>U;^CoTkKhQCW!bRgU^2YVKF3Ts~YXv*zL-;?^-MyMyP&?d@=$P|y}3S)f77tP8Yl zPwbS~S<;brE8wS3F>vz%Kj}&9g^;-fY^k3#hv_vFHZ|1!euoL|%2rEH>tbTlUWoyR zMgl10ggUb!Ot_Q=ltlzNhN5B(P(u#fB@bvqNolbj#NEt~+LXx2BPj6*2c|1>WEF($ z#I=ICD3M3%OS%#*a^`m4Ya_Jd$?xYP6~ajxbrA3Zb#j^$y63<|Wvk#63m2F7Pf>5S z`sl$Va8!551QnIaa26}opgpSUhF^OL^~j#L%cHk9d3y1j@+nw0J(ZGCLqf7?ZQJ|h z?Y_k;l#Hs_Oow&0f5MYLDMzUP2IqUWcS)&pDy1oD(vA-n6UH#uB$C~&R_oI{Q-H)` z<#3-?vPlvD!DX*CYtZ8U!X^apX7xvtymYgWx(=@$>GM2|Hy=MIyW~+5&kaB2|da?k0JNX7Hb`}N3vIXnf2x9+m@5Z z>=R$}ujtjiWO|$Xdlp-Kz=vWSj0Mz86}+>(t&;)s|BgoW8nP-dG{shJt+~F0-k4Yk zx(x(^2n!!+;Pg>*aOVQ;(iw6bdaC(tiWbffL3qKVwWEy-a>9% zJKA=dbtpiuP$s%#ZLsBNn449JHV)di#z!b~1mF{vwz$qRnyswEn>TVl@N^h9ZoXP- ziL*6`l<1>z@RREl^W;{6{YdYUL3%5ChUi>X2L{}f3bRc*{i`aekEG|>@36m`*)E-N zhkd1cOCpIq-Mo_1v-Mnx<0_!piD3QF;%7gf_Q^uHfR-JQ1=LkpTzfj_pfQ_3UfEPP zxR*TdnIjjg0@-o+s=1nFE|d=m!+wI+Sm{TDv-6sR-$^pz2lF2sf{JsFx%b9C7uNZL z_e0P*tF|o6=wX-<|ygQ^y zp<%^=$e$p{hb#tB*o|xp_{Z>*-*V8F6rwD7b`zgE<~7Oov^I*v zyC1dTwU3A9;2{4`Cf{93WA;ZLQ|ya*%@D5_t2gg9Dg@d_K3%*gCm+q;D@G<8$?RQR zXsJHY2f80Q`ue?7Y=MVeR|l!EDpS={Gl&sWQ8mvo{6Ri$Amb4QVZpP@JA_a0!$Ec7 zR0bV)Zo7}y--&XJ)?O9hZ+>r6avmy8$z#^qH9Tbu!ypZmzWI)Vre zIE!8I!-(PU|G{_p4~VFla7|PH1rSQ4008L!Ya%exclypl{S&#-N&T!XvO!o#xy z*-7-ycNv6h4p4Jx4c^PpeqlbjwJ>GVR-&4=1ln6ttFqk4UB> zOx6Co5Y$O?-ribWR?Oq;xSrUxhW?k}b+#6&v!5+W)#J1YK??*lmkBaprzzqNOcpar zK`W_aud2(kV(Jw54u33D2M?(v+n`CN0%&t16J{picWUd`{_;$D)MHyYK>@qWbJ$z|w z570k1hLdmA9%$G=OPY%Gy@G(XHR2YQ0<12<$pu3Z0Z1bcST<~rP?Y%3>QRiOfi-z~ zDMXwsm! z9NS2$uCq`G*k)`h9I^q7;i7{a^`9UHMO{eR@|P7 zZP4;HBOQ8j;@TVxm<$zZly>vD4S=lGi@7hR*<+cXC(%bvW2+Z3QkNf@RAgA``3DO} z!8#-}OGRQt>KLDMXoCEvqL7L5&toXwQtvz=x2M!IE^ssR_EI~+=VjhLy6V3NDol{` z<4SLy$8L}<*u*m@OUv46D(xm};mF(D-wLaU?iOCF=y2M3dcCRI3#J&t4Ty=6MBh&8!pV7U=SX82aQ91S#u>Q2n5#UuP&_r zfOe}+-JBer1!}$*W8^fc3i%L@sK);r z8=vU`#RDn4l<|dE9QDPa+B95T_1Q^m-d846Bk zFNl@?@%XvEOzPE0p=0~IP>WQL+6fFr%GD$tWp?QIr6Ke9=8c+RUd{0cx4N4TXGt~- z@~=mVW1T`a8^NXn<+f{Opc(twGQzxF78!|>2UhG=pg2O6(%rZ}^kYnp3k&3g3LooA zWCBzsJ^~Wz#Mb^FvmH^T-&eQo60Qr2h@#w+pZ#P~+vJ3gSg15T@FCI5;N%EXJ3f}# zP#cW^meb=yJH=VdN?fI8Evxj*IH!@61`Wsq$V#po<3b=oN32-yp2i{E z3*Cg>m5Y~XOwPyAjXnyKWTVW9IuGU}=P=5s*aceAKF=6XhT>&;v}5l;J9H!2a3y3N zK|&B)fdb2K;KboQ1gV<4;Yz%Mcj!2;>`(M>n5FuT5KuodICN=Q2ti91NF(ek$Xs^&_V=e&bctpW2l~VI#3XiR=d-cwy0qH zoqk3GKqPrmet;O+U_gCM3LUd{*J2qoLWE`cOpY+-UzK#{5q5@Fqrk z+k9b-29r2W{ocl2NwrIyP@;)&m!MqJ(T~7XfXcO7fe=wN^xY7)t$jvf(wlQB+f&PO zcEYS$Xx8Cq!#)))5un6- z8GB9Py0G;Y?%3j3XzoWq-U}1jUM~fXEhW`|pBtEW>ya&9M1e#^DZ^8XN z(F?{*E(CZVegB3PzZ&5B1S9|)V6tRbvOonIYCQ+Hmc9NX3kb9g+FhLXd5ViEAhHWs zG3BZ(Eih|lflH$0&vA`fPSyd)3e#rrE$I>A0XgPv+`M2)52t8x&a1b1Z(rH=eQOBY zDcS#bJLSbW_B4`${Bsc@+(K_tMeXGN4G6#yhq9mIjU%x0;>Ww99(UO7kyO|wz~23u zDuUH)WCFAuV3$6BOpL5Keh6B$zhA@y^7p0Y^CMGWN`aUXg})# zfa%L^RLjf{%BT#dGn>n|a%|-*i9NQ*a@?)W2}S%8aE{Vf+!itiZuDiQM~cJU0t3Wi z{|~MmE=!;>_T;GKO1Zibi}x@3ir+()(!R8Gx5_cebuA-D_DICeY>0QrvjtpQ;LELR z{uc4q{xucv+)M2O_dNn9RUi?w*+x0emB#I)omJL0kSgCMlaYSZlbMt~?H_hZ*DUEy z@54WbL0{F{JG3qlP5Bac_^)()aslRUB01I94bd>CCf1n~$=t#CXNI;qJ62%yS!95> zH~GK}JB7KZtffSxBit08wwab80Ll+EGL;_$oYbEs32jH#rQ^((hl|*ma6~8BP@1X= zhHC<68cTm!%;(hbbg-%IkCV+~Yjwq06kl)Wi^Qt0#+J0D85Es7A)8r>zxfii zBc^k&j!yGNX)KZ}#)3Ttp2teQ0~j8a(rA30lYC0<>IO6^jHP&R)DC~u4!L>=4(Dpc zC3dnlH#muvghElM!h?;mdE{yPR zJ7e*;tBv-*6{E&3=7#?<97tB~v)B+o=z@NP5A+Dr4zOq;1Q&+u3oQlOK2+D#2acAWczo_$DMmzPR4*FO+poAfn#A$@ZwtqBk z>hyed0arwg6|@t933f?iZUg>9IR_YmTcAho-&l?90OLSY^Uf~3QPsR5Kpz7|Bj{7k z+Bt@DvNU|L!;wz64`Fn6Ps%y9Aym9>yC`Q~$i$2TX*n8%tv{BiPhF`HPJ3U{oBh~s zj>CGYUYI*sLWxY>P*w*=>+Sb;d9d`#nAzuICsP$v|MuA>F?+OWiJJj=9Mt;WPrTI;;tFdaM$cpotej+Wlj<{^Akfy@|dT1!0Z zU5tTZ6%K{a+U%&S7vB$WAq%MLMM785o)S{TYgfhV@(p(G71lHV=ua@Z{>Cn0?t0o; zqBI0R7Z2By2-^2glVO*0LLyP-XUYr#&?WV@8Hwr7c~lzExxi(3SyWYEElNf zs-D$$=u5_7up$^KOdf9ZnDUxzNDOS`!exnhlSRxvZkzP-MS89d@2M?FxPi`xL=8)s zqbvW>Iji}i{g(RowcG+}`lr~b1ykJje>Bk=f5Q%eiPqBVH|zv{!;bj>2|Ha|LkC(0 zIz~E1b~+<-M<-fy8xvc48A)MLIVDkT#i^J~288YpN-@v%WgC7ehM zeW-niE?8F^t|3M(TT<~vGm|gu!i2Dg)lEvZQN`c)d>so2du`tS{lj48b>UsPGlYPg zP1I&3a2egMg_Z@d)WcNVx!MrZ^1DDVF3lsZ98*&$)~N#gf1$fWUCfRXWeGft+(IL> zfq_05*=!sZAhFC8GWU#d;>ce4wlmhYkA!Y%$DC*1o;{kCtccF?p|N*4;m~r1?4GkP z&UWbP=vEF+`{M*o!rtAY)nSyi*fRgb*!qO%jl>fM3hziS_gtr7*8AiK5069usn`Va z`^xSEUDP}TCAjoC;i)xKeAinSK}Em#;Cw|FH2Iw4kV5TXGOt?1e79)VZ{F~}g{j`4 zodTrxsChT`&qfjBTp8+@MqXHdhchR7g}J${MA_Xv6ukJ#d>Txu3RJO~%rpUL2p2D9 zPKtoZcMiWO0ePA?TbKfnv(QfgE`qta`{CD5A|dco)1G4`nbem$P}Y)7uNa;r5?xCMzfIG+a!U(p{|IPbYxkUy0j}c3B19W zOKaG$l!pl=nFN=s1K)+fm$vZCNJiY@wyzw*E2`i&dTn8?mh&L1Ye72=ue7yF>zA^e z)JqT?fcm0BOA$tzu(-H_e^limo!NgDiHc9wdYAgt`X`(_F zIui-el7vwYWe2Om>e2-eKW;DWCn7NQIUbfn${_NGs9pXK+RJ}}7t3&9Mjzpq_QDDP zK=c1Ecxh$jMMP!jtc|MvP9!R{Qc7wc6l6m7fp{gahYh0`FNtrUM1sxt5Wq$QlB5+BG;mxAJ9r- zb)h^L63mso#kn+lKzWrG!gpJ;7*kvby3dWyQj{Syk0KmGkdKyKZdr zQ<-{8&-K=h#cv@v#KybXrFmHBCieNcD}bg*t4XOuu|mppmGeptqP?^G`+nlGkk0LT z0%=N&`Z*S&_v1R|J~*qEG$L;=r(g{X&hN`ftaUdXpmm8|vjPbYP47eJFNidCAFN zgZ8)c8xl2~Vq(oQlV4j|GFNo%>oOc4(=wZRmxGeD=4AF`kc5Rq}yC#l?JfqhU^}MeewVSJgenXwm zGKPONu9rbZ!MRbYgv3VEtgN->@>Sn*b(V^8J`cd9S)@k$w2?2U6-VV!l?7^>GC~vn z)*1JadRVl#UN#6LB)mU*dq9}F2%yGIC3DZky-)>i)KN>j#5Rim(b<%Ry<>-)!-G>_ zB5U*1_yX22-flGhvLzM};ZY(TTXfYW?%yBJDzB49c^BiM;X^V&+39Dfw#D%afYn+z zGgc+sZP${m6C8R8jKQe>)`AMxjKTm8k77K47VRcB6;U*sx^r&Q6k^*>mnurXhL#pW z0;8VpkK2X+tFW^GifdW=IPMT2xCVFE;O_43n&9s45L|-0yStO%?gV#&JA?q=^1XL& z^0@cDH&v&WqL|UFlg6e^XX9I;?%v|@a1z~x;1)w} z(?E7pkXjxxe1b;-MjvDJoR<{o+k|SBug18Ru+NnBB{}I6)Fq3sHw2MEF=o^5)2(p* zgN~nW23;BSmAa9_s4Js<^C7{&aovjK^STUoZh%*Z^oF?gZsj50;zNLG;D#%Yxm>%s zq8z%v_X8#~&=ahZ={}AjkA=KnXccQ^EGj91oo($DYxeyB*cu-Kf1Y8s&n%ZvXVZA3 zdzfThm6lGysl0Lret=CHENfjg2rCcM%qOP+7oaG?I&3j=ZG>P~TP_%F13^WE-H-7O zxURNJ0aWNMJU37tl*aUJ$+Er^7`hu$n?j=zfs1}P=AsckFyPECJcFOBXGIi14L8XD zvu+CK9g+w`g4ZTn7)4)KvoI#2RT&ZA+ju$;lp6yv-9?q-K|qP9L1W091+aMOY}yK> z)bnqQ4^0HHaM+exQbeLTQ4KlN)V_!UI%N4Anr8TA3SvmiT!WzqT2$ycu$&GWO(IYh zjPm9`8x$TKxWk-XKId@hx@xDA3W+d&s^G_omV`3@Uhc?XQ^TufFyJqI6YtWtj1rKn zhfK2>I0rY$gC*{goCBT@q{BrDKPIGNtcI`&S14PBKq?7Z={JdvQjpRG_wi`#@j_`E zAr(rrwD6}!=Se}o-aKrMFxGvaGm2VW3`PQn>q-<7|9#jGE7e6i;cWGRCjd?y_lH;;7(KnoaKcG+{sSXSs z8cSg$ea=5tu^gF~v8l=B%aBE*l_(_-x0El>1)6JrsH@K>XIrol`ySo4SlJk#-1Lswz(_#A zYu=y*T@8yN?}v@X6mzer|MCvrGyO-B<5Cut&!|FmnAQ+yykr!Fq;erW6srMY?c9hz z_-%_x6ijl~Yq*8heRL~!$SK}gtb8`--%N-(#-9B^i?kbWb-`cPVpheI|2|brdmyXJ zVBo1I455M7%Av?}{gUTdF+6n-R;+QFmZOUILRoE@ZTN`h25nNeG^IwXmL>Kk4 zyN5B&%n>Xm;A_9R=nZJWx7*yb^$G2d`o?+4GV_AP((-w!8Ru!Zw{`pSfx)GvsFS7` zJABZ6fnh)*IADTvlr`W_TP&%0l*Ta7;S2a}J+lglJuNYaJsDs*`&+dKP9HBZ;j#RL z7-6jA`Z`o6vv=KlM!X!Hg0qs*%P=P34!Ows9NC4R?Y2e$~8qofv;h+)|2^TLXh!<)<{ zI>w>*{hTZZKbzW0ik9Hl?dME7HxeE${sm|;GlpuF)=G!YYDA#Kk(Z&OMa>D=(bXC^ z&jym2hh(tMMq>RWF9ey2LwTl`dX$iM^UhO$=I8s^o^;BaVm<8&_ZTerQpMa_HRdxztnrv%64+V>h`V1qT#nLC4E|8pWs09V>b_?

i2;PHz2RKO{pWyk{- znL8q#sJFo@0XV(zmSyRgC;4h%v_V+Ezz z>wWmCXU-C_SLIUQV3b+sBM%v(!NqL@1=6+^IZuI?xe{TwHzP#Qfrb_4xcBX}yYfA^ znX(w68wDa0oO6H&`jAsFPm%N9^cU&XA<;w-SayAE{s$(WSFvg&mZD2S zxFO4?FrL0p`xL^`q|TBcHv`{_?Dpmjs6Z*5aI*I-Wq#ymGYf_)sO~9X3=)kR$twsznO(x8iTWBxj+ADW-@mA(=IVbNhKj1M;x6$u8q6D-OC;THb^iFw@?w9eYOW<~*io->)% zC;mBGH^qK3D-$3#DEnc5(~9>NwLTQT0mm(4Muc>+Km-^ znN<4MhN8M>&nUgl!VmVj$bI$%m{a1$*y1E?3ka$wi(XOgZHmb|eKDb+jYv<<1-;xh zbJp8OqvRaVDLI$?>M?jvW-~)_8g(d@Kk|!<7$)&Qp1YOeu$I9rw4pF0jT>Fq9$wuvz#isvVCE0z)h_JY# z|@6W zC}I%rUoSHD8-YbD2r@}fR?FVyh$`d`povX(^U76nKnqi%ZYM`}DFCsS4IW8j5sh}8 zSOZf72`!pl>J> zp){#LMd;}zxUsd~wReV+nFf~P-?XH#4r5yPuAnv(p2e~G47RgYc|H=1WU@)GhjDs& z7AO$X?8$F3&5iyi{3F!y$H7KkX*B^Z`L_r1YmM*im+5yH>w<@mtgAWDHjl7;JQg@p zK5_&JI_=50R$0`&aEqoW+%Vu_JrcAP*{m~%TCOKUc5US< zS|}x8i%yPy7i1WQh<_)skxduUa6n51#(H`5(WCYDpv)RJJaWv zw85_g9{|(^BjAi$?tS*kR}doIMlwwkucv*(_|MtuvDr&(_}SEVt{8`nm1@SVE{{^V z++$4FVC3%+^bVYq?#$gy)8`ku<~xmblL&tl&K$bh%jsj+qk}j0boV-RJXIg3KH)@c zH*O!LS9NtD3|yZu;tc0QZ}TxvuWQ|enz4baLPI~g_S)zx}gSA!crEjRg zq039O=U)P{}%lu-nsGs3tfJ#T!tlYBhbprWorZD5cm*+tjJnf>^k`yrB-hfPPOOQ}A6p zJFo|oOfiD{C}v3g2Zq)LW5LPy!3LVTF>^Cl6vR3%d7J7^n zux@V$B1}?ArOU)hWa=4X4W*vdg}PTnLc2jLQ>Whab@gY3Tr2D!r{M|d7JNSU zIEbjdSX;sQ5_R#Wfivm}&7`ZG+-IkHxF?VF4TPZ+1PWIbLoa*pwuRRDdkYy5-yqA_ z85Y^S{sHnI4yTO>Kbx3^x%|^)~ca+@yc zLxNJuPkEHy7e)_Yt^po}*g*WDH9MGz62z^tnLM5qYvyM{7Q+>0qlouhx(26fP_~5=?lEu7Htb|ZinDima)8z@YukUKxWS|a3vnW@)U%#eHn)-9A`&Tt`^0A`367NaoVJmI&u z!EA$EUU`gaQgVU7Nn6ouVr_sVBdi1OPPoB%QsuJ!)mzL)Y_qY$iCQq`gN4%+c3GvP zVMRTTB(O~T43rp7bvt1!Aw2k@!$QaBP&H%rexeX}3WQ`EP?~CBRvG0o(o0e5HmKwN z^){}Zi&zCCLax^R>UQekSvVYy8^qbGixjU4()Y5+Q$HDG$-q&hV1L-}lRaW-{HVK} zh5ebN+qvCZse4gasXMi*p~EDHiBrSq*KLM@llL|mCgv^v$kC3IT2JAihb=XkQAO}w zQFZf1bCE~z&$O>mJLG%KC_fr2KBjF82@6Z5jeXID4P8H577A`(s&83C=NK^-sn~Z! zy-z!bxP~cwS$S&1tec zyqE_i8D*Q!*;FtgGW#=)_qVlXw&AUq(HPdu=%*zr;o)6%<^~gMUS_IrBX>{uZiYA{ z69g-LjI;4=?h6Ldjxpi~rnf?1wN>yQYfYkRBSu9RS5pPf8KU?dZFx$n3Vk9bTx?V- z%c#|LYm_SVjRUStBl&rqows6VAPic48mNnx{pFL{ZHZsQV`8TJ%lE^PV8*u0YF$T= z!fYb@C!@l3y^>qMpEZFC!$=2Fy5(^x#`O)PMxpRv`KxSqBsSV`!=yX8AASK1pP%oD z+J<|gCObHZixwdCzJSS{zA0S;#_f1hmC>a8)N7oXE9C8>0t}Tb<-wgm7+FWBOusoP z1eW-9>%Khg{u`&}+Wm3*-tlQ$YB95cef1ntkq7cw3<6p}s#7kJ`6^m2q_f=sNr<+S z2UI(=(*xs-?ZeI9E38tJw_LvkgQSsjT89q?cpM^Xn4)7o9nu}$dp}=xJ_m9S2i^%u z%4@}eWt;3`R9Tb>U=d2;a;u};fh}2~j$FrsQqo#P*gzqY%wW!(blYO{J zH>@&d_L`@AR4A5VcunumPP$R7!`>|xhuBF>6O1g5TAgfO%dGQ5U zwY;am`)A5i*7i@BFvgrO>+`3^FtRf~>xQhV;H?{S@$zti1c{w|;YUT!72sdAdQoB5 z=?SqUi^|db;y=|S#l`-(^vyBivbT&GM^MHo#gRGUE|q0)sEFmS;fv3Oc_W zLLPz|tHbq%HT}9e2no?)kQo`%5zbr;QI2*PPep+QD&+Gg@hC21_eZ?wx1bWjpO_u$ z7_oat@)H)M?3OGSlrG`|EotRrc)2inYJgEMGyNx`Y+?7zH+m5hsJ{D1P%8kBxJ) zIr)%?U?3U--U1fpS5od8#*Vs@lSj zdC96gsp>wp#CGSiJ^FaBhNRk*405j>!M9k`xjj4S^-sFt(v=T(ty-&Ciyj_Z7k_t&Zl$O@pVhUQzgI-nzn!Q`25k** z*TWzQlSdt`vCbyk1MW2OYUIaK`=^iLY)u*ShFoKl;-l*K+=RXLU4p@UcS_Vq)l9`O zD{w2ZOjV9Y`s{V-iCaHqU|B_W*f_tjdi%Wpj)}ckebdNuNch!(v`R6i+UgnCNoLsN z1CyT$v|Tm5u!<#UXW7p>o~{mo8H}&FA%4^GJR%nc!y}9(^AnR3a-X6vq%A+S!ksTY zd@wmuC;o~vI=U+`Lm7aTQ-B@KJZzw3OKV8U6hq&WKTN?#e^w7CiJ%nBa(If1tJT$% z{BVWYZ@&^pvqnu5i&Vc-E%g4InftgV9b4H|q&c>20@1A0qCK{()2L)-dVchg$j&@1 z$#i!N?FOWg52JQAU-d4NB1|jo;mCwiGnf76 z#6H={+I9KxMM+$rmbcJw#~e?JVVyDLW@4u)<0VHtS8>ExON1L$3GD<;NLOoEc#&91 zY?56tXVES?55TBsd~cmxL5i8`M97pSw86zBbKcpW)?>R(S=5-a^$;LUAsPe;*>;U? z)JuE&T5(jRz$QR#Ks(pZWZKfzpbEn+E(Wr!*nGWQGTKC2TRA2+WgsW$KD)?sww>hU zZXsm*Ne`wojk&37Mh*iL7%1P84r2Dap#a90&yaa1#yWE&g6ni&xAK(2N*=9&4m5Ur zV*(rA=?TzY^nS3b6{3`dCYs#4O=zR)LM!fZBL_i)AB`R}BhU=3Cii{c2AecJ9r#UW zu{D8G3ukFVS`W;VBVK=Uz)YrrR8sHk$N*^e33YR+C|htd;`kU)S$(UKbFF=030gu0 znA!wKVnd&tF&Fy!5{y)I|4n(BAoki_8Kb+88tGKLSmIP$Yehj^_Is*s?{w~rPfGl^ zoO$p9d5rK|gvc@Wyb@pu=tmCspN{aS1R%jZ|+sqeRrR~-VALRTU@Zsxe%Ts`V@zTMm2u_f`{Sl^Z5=A zyU2MVn#{Uu^|Siu_whj{3!pD??U(qs_ynL&hu`-=RC>dVV!X%W((koBZko4L&DDCRxPSZl|r!fh&NC&|377Zehe z%7SSR8I=?q+BOEUZB#c7nk;L1jE1g$w>DeO$r8MonDHcV_hX~%c}L8?RaackSF;HR z)6sY#dWky5I$vwqOgUIHHbbGbe2%pP@$n7TGSH?I3Z!koOr60B8_^1e)q<(S}HV+p|TsHHVFLvlfk`HXki6%WCgk%t8WEsr}YX zMV9n64?}Bqw+8%)oUnJnq~uMA?1mujNT4ZB*aSHugxkivTq0PgJK9GUNbR>Nt7zKS z1;(X<-~GZ4a-e0lx;EvXapew9_eIz|h4z+MGgEJtx(9}tr}N3e>w4MIqdIh*-eyGlxC>K9W10_KEI1od zd{zzkqm)}ZC#7}tRNA}|vk|%QVIhPI7F8vbijTuglr^lG6ZY!von!~QlVc($8$^#v zqt2f1!G$y|#H&yRUa1TD%JyS%ku8r+GF}5Hu+|dTub+^jzv-<0+=Duseo^Z55Z+mlG z0^Qfeld;E~7u84*bsh=)q&=E5_*uF+CBl?J2eOJeqy8hW$g~X~KaTVw1>5;Hf=G>G zwUI%--Uhpa!NSJOdvqy4Ej;rDiM&I441Np*^JQ*f_cGJxBwqAdoOq2zAW<$R4-7yF z&vHp+AdCoU10(fhvv*(P7Fb!EJQWlI?^p%D-|`e|*xRw`e>lsy_#VL|c8S5`#1^|t zKVQb)+D$u;723*mm5O~>+Ifl3OP!#xYXF=&@?J^V^rm`EQ(4*Ei-)(dQjqN<6wR14 zg#t7+7rx;QexbO&+6a$>Zb$oFDEp`@#EN?(I18T)C~K095_e-wnwsmmVs+!K`QbTs z+X<+wp@4{E4D>NFC)Q*hN?{ z7b&BR5qrpQSs|(GsuCrc>9Xs{aUaxsC`5Jz^ON@CQYT@1E7xjGmV*t6w?D^hQQ{Dt z-9}_Fwfux#32Cl)+a4IT_H@@n)17ULu?!3jP_aTCvF_Ool{|nS;UeLI!YL` z+;3aGNnuDQ^#QNOAKajWHyS<`4+{WF# z&CV+|KIntS$k%ZrL;yEyUM&a)Nby^*(ojg^07J$(=2Sim1d&qdEp#fEcJ!JR-&$5} z*y64VZG|*jE}f=3L?3~zG>{=sp>a1=yD~gX zlc>fQ?0xRDUZ2K3;R3Z9rM^oJTi;-P&0RFfWu&Sq$)qwKR0F{rmv{}-qmQQ5hMR>> zSk2Vzr(mbVchJR=RI_fQ;|XG_fl~KwwUe>3D3ndqSM!t$rs26?FQ1kI zQ_OpMuM?iN491<>6$|bzU6h`intHbmtrlR!G#=*guh3*T_%i4&(j393l)!DINs{?Y8|p%V$>3NA*~{sqV@ix{6?|B zJ%MmWGbeE67OvV?92w#B-RKU%%b^gHm_qX^AFxDTiR#5{3_{g!fwV@I!t=}v|7Y_OuuBR^^)RGsQXo-{>yxPA^S4?ghM0T(zU>OZ9xiL7rMYwy+ zyx7TdJmYk}JgM25XPf_ZgQ7d|m_hx`sZmJi7ix>WNRf*IPClk>y%L`VCSE(O`PUCZ`1ylJpm6xMP? z^b1v@B(?-<_OA=<+>7lCTALyn?Ci7!noJF^RE@JePhYkcl9LG`F6lb z@Zk-X>{aJgIv*WdYsVB&sR;(|1T{iMZ|l1!e=glk7j z9}NkX(}q0Qxo((g>BCifdxL2z3jyW((v;h`k!@r0Hrqgyuq#l3rCUwLbPjh8<61}4 z)*9{}(!FZCEUd3b(d+r^?>NWtxGjglNSfs99zQ2$(FR+4ZY_Irynoc}Y*el{rtmg* zC?2C;d`HRWbC{|{Dom+CZPE9}l4w-IM){p1RaZe^j8Yj5(!I!-Ni8O+iu|{Jb1b91 zjz>EPW7lqfBGEUBiBHtQ`H7d8MuyG$q2Fg1A^pjH7FVIA-1%L@f*w)^Ptw671(4R| zeOJM!&B8Evfr81eR#pnUPFRaQt3hif%D8hmIpvJb=9EIlZaQWcmRA%ol&KGm2Bit) zG9HVK`Wd9@YVWIixLAs?Kbu$J#R+AkTFRGG0-kM{iQ%Qr!HYcSPJB6x5~$>bX@4TW z`jI=q znTQRzycZW3XgnwOWKuQgcFvX@xJQV9rdW$&COG0eD@eG6Urt8J{8-DO8GcDIy(#5U z9QpP8q-#IOm1tgOy5#9$jYLo-(hn@aqt*%U-wId0gwc57P3u~K@cHE;_#a`ksu^ynnKBn{=*$at*+{S4E(o$RP2jRf@|U5$d6#2__oC>><6`~>|36T=MC)Hck} zINjtm^&C7ojl}4nRE+{T1*P;3jJQ;z0(lYh-01MQGmebt8MmCSW3x&P0Bz# zfa`zlR5rG9uyfV@e+>$IS_c=0aG9}jsR;@ZuQgHoIZ0$DgT*;VV+Afn3EusDzaO%C|~{Vhg8SWs3$sC=$JHdg`- zXu{JQcBVJ=3*hq07(!nG@3(&-=OMw%-*KP>kVDcL@v*kDA*ekkYpg* zdJ;>h=r+`-bs5&v`E+&EucQS|>+;LheL(@En=x8WFFz<;cD*2j#)fm?Kd=Oph?PkR zdJ{4{NS4%)wFt+je<$ch+^gnL87|NYqn^^sqP;j79lZcuw}cVxSnroBS2$YB@%ah0 zdprx_9sLC@%OEq#R0;wtu6!L2IFjq1hsdEtVqc0I4nVSZ zF~SLFbw{#Bj#@3YY|S?&2j4+S1ac(j<8-_#KHyoy3j_!1ioOdl$MznRhHB-NVj zxAU(ZwEc8AS*5vEHA}lRsx50M2$>lU~Y#*=N90KS4K)?T@3Yz1 zJ(1g*k)fbFP=SKhWsUF)&0|IdFWl192$EF8l?HRw;RElhpsXq^x9sv- z=|vt#TQxzOg-fHp0r|zAo$ez>Dl(#IUKrqIONl%j4t*9xflc)5xG=SR@_0sdrSVQ8Sk)du1F~HErDYJmE-$3e7 z>*p!Kz1wddg>tc`aW-YA3 z$a&m@g^^G$t&h&q4H`Y{uVHFVA&M`C+9(%N2Kk^JGndvYk9%v7l_l%=Ex7kH-1Du| z-QZVkW;XE$j!yYuD{LY44y(7_63+f0L?x?$7m~VB`ExKY@oxK4_A*jOx~Uq?oem}}uh<)EB0vz;d3^rd z8$TI=5Njt|S53x7Nrznt$L>4chV%*gCYEa!VltRO>w`s~_{DVgh@yOyuJL(xIRc!_ zIZphwfztr4w+00vJhi_VNnFjI@WgByGL>%fWzreK(|8{RwAQIR)0FVz#AXBf2M_)zD7mAH ziC^m3K;5U9uT~YRp{z6VW3E7=JdF2I{XcnGhTqT(#T9*T-AtnORhAT|9n!Na*R7ji zubaPOc5aE@v(K$-gm$ua1Ik@N1I7TG?2P5TecD7tQQu3t1Uc^o^;BpFLzklvn7Y>K zF|Cm@T#A7_k^ytaoVfEo$=aNuDHp@LlfH zQo=_*?OOzK_j|ifg@%9~L2K41*J}$|Iks&-=lV|h+1C`zk?b07_!wC5*W}bAA*>Q!;0j5c zG$@gY#vg>YTLb$~@P02bIc{}n)Et$|&L!i<(jZebwGIb@Y%Nc23)FB82|b+c)O_bF zjLO%$kcoa0#zogTTw1Y-IcJ1XyBbk51QV^JS~lUD-GqBxuqyUlL8*z=2ab=(+u=5b zx2*d%2XC#w*zL_mWK-cj)?OjoLY#@J%Lmf{E-ve~%Xco}w9*d}($0(34nBAGA;pLO z9Ii)n<#aq73qg2RYMpX~b_O*)Z6f+N`y9G^S9oQ75S2<3=$naNZm`F@b<=&47Ragg;w@Z$A02 z?$$NADL9{WC5{;h}eyNdMhzI<`Uav`1JB4E9s1I{nMM4ErJ24BU$at!T^0ac>`H`d75<<+cS zY?`+0oL&b80=fkhqW@2u%#8kGlc}+luD!AG$5)&eA7A=GQG5e11OY1(=MRGbuMNIF zCV$}o1Vc^CO&#rwUwi3gcEr4MTK<4jvZ8=QJ@$W)5JPgB<^bUX0g?U(NZ$G{AbT4Nz{)psu(Yt(b+86h*f#xJ<36(D>@Wb9QnK*Bj`9A1 zZSYlg{RgI{(LVuS{6@FvCiMoeqFFd!`@cWp?}32%zMAaI>f};4RN)4AjGpqf*AUYF z%5l&)w=)FDF1-T1UZjBk>_1zBZ$RN+K>sR4{!eT5#V(bxCR!MP!yx4T^2d%wJ1?{-QE{{|D+HLZH8+{!YU3 z7pmd#Pt<>P-rosY{-T_`ru>Jz06!_sNTYOBB2YzCI}a3)ln<{9m&n{|5efZ2UF(^}g<3WJBct zPX1$;_iN(ohrz#yK7{|B_^(I9uX(StV!wF(H2hv;FBze6%dvH{U2Vs B-(mm& literal 0 HcmV?d00001 diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index 362e4e9f..4fb6641f 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -10,7 +10,7 @@ from io import TextIOWrapper from langchain.document_loaders.helpers import detect_file_encodings # from langchain.text_splitter import MarkdownHeaderTextSplitter -from splitter_utils import MarkdownHeaderTextSplitter +# from splitter_utils import MarkdownHeaderTextSplitter logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -444,9 +444,31 @@ def load(self) -> List[Document]: # chunk_size=chunk_size, chunk_overlap=chunk_overlap # ) - # # Split # splits = text_splitter.split_documents(md_header_splits) # logger.info("splits: %s", splits) + # from typing import Generator + # import itertools + # from langchain.text_splitter import RecursiveCharacterTextSplitter + # def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overlap: int = 30) -> Generator[Document, None, None]: + # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + # for document in content: + # splits = text_splitter.split_documents([document]) + # # list of Document objects + # for split in splits: + # yield split + # def batch_generator(generator, batch_size): + # while True: + # batch = list(itertools.islice(generator, batch_size)) + # if not batch: + # break + # yield batch + + # generator = chunk_generator(md_header_splits, ) + # batches = batch_generator(generator, batch_size=10) + # logger.info("current batch size: {} and next batch size: {}".format(len(next(batches)), len(next(batches)))) + # # note: typeof(batch)->list[Document], sizeof(batch)=batch_size + # for batch in batches: + # logger.info("batch: %s", batch) # TODO: Local debug CSV loader, remove it before release diff --git a/src/scripts/dep/setup.py b/src/scripts/dep/setup.py index c57b08a9..b87860d6 100644 --- a/src/scripts/dep/setup.py +++ b/src/scripts/dep/setup.py @@ -12,5 +12,6 @@ 'requests_aws4auth', 'unstructured', 'boto3', + 'nougat-ocr', ], ) \ No newline at end of file diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index c0a3a294..34c18774 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -9,12 +9,16 @@ from typing import Generator, Any, Dict, Iterable, List, Optional, Tuple from bs4 import BeautifulSoup from langchain.document_loaders import PDFMinerPDFasHTMLLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.docstore.document import Document from langchain.vectorstores import OpenSearchVectorSearch from opensearchpy import RequestsHttpConnection from awsglue.utils import getResolvedOptions -from llm_bot_dep import sm_utils, aos_utils, enhance_utils, loader_utils +from llm_bot_dep import sm_utils +from llm_bot_dep.loader_utils import NougatPDFLoader +from llm_bot_dep.splitter_utils import MarkdownHeaderTextSplitter + from requests_aws4auth import AWS4Auth logger = logging.getLogger() @@ -294,12 +298,19 @@ def process_pdf(pdf: bytes, **kwargs): # download to local for futher processing s3.download_file(Bucket=bucket, Key=key, Filename=local_path) # TODO, will be deprecated and replaced by nougat class in loader_utils - loader = PDFMinerPDFasHTMLLoader(local_path) + # loader = PDFMinerPDFasHTMLLoader(local_path) # entire PDF is loaded as a single Document - file_content = loader.load()[0].page_content - res = parse_pdf_to_json(file_content) - logger.info("PDF file processed successfully, with result: %s", res) - return res + # file_content = loader.load()[0].page_content + # res = parse_pdf_to_json(file_content) + + loader = NougatPDFLoader(local_path) + data = loader.load() + logger.info("raw data: %s", data) + markdown_splitter = MarkdownHeaderTextSplitter() + md_header_splits = markdown_splitter.split_text(data[0]) + for i, doc in enumerate(md_header_splits): + logger.info("PDF file processed successfully, with content of chunk %s: %s", i, doc) + return md_header_splits def process_image(image: bytes): logger.info("Processing image file...") @@ -312,8 +323,9 @@ def cb_process_object(file_type: str, file_content, **kwargs): elif file_type == 'html': process_html(file_content, **kwargs) elif file_type == 'pdf': - res = post_process_pdf(process_pdf(file_content, **kwargs)) - split_chunk(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index') + # res = post_process_pdf(process_pdf(file_content, **kwargs)) + res = process_pdf(file_content, **kwargs) + aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index') elif file_type == 'image': process_image(file_content, **kwargs) return res @@ -352,26 +364,46 @@ def batch_generator(generator, batch_size): break yield batch -def split_chunk(content: List[Document], embeddingModelEndpoint: str, aosEndpoint: str, index_name: str, chunk_size: int = 1000) -> List[Document]: +def aos_injection(content: List[Document], embeddingModelEndpoint: str, aosEndpoint: str, index_name: str, chunk_size: int = 500) -> List[Document]: + + """ + This function includes the following steps: + 1. split the document into chunks with chunk size to fit the embedding model, note the document is already splited by title/subtitle to form sementic chunks approximately; + 2. call the embedding model to get the embeddings for each chunk; + 3. call the AOS to index the chunk with the embeddings; + Parameters: + content (list): A list of Document objects, each representing a semantically grouped section of the PDF file. Each Document object contains a metadata dictionary with details about the heading hierarchy etc. + embeddingModelEndpoint (str): The endpoint of the embedding model. + aosEndpoint (str): The endpoint of the AOS. + index_name (str): The name of the index to be created in the AOS. + chunk_size (int): The size of each chunk to be indexed in the AOS. + + Returns: + + Note: + """ + # This function includes the following steps: + # 1. split the document into chunks with chunk size to fit the embedding model, note the document is already splited by title/subtitle to form sementic chunks approximately; + # 2. call the embedding model to get the embeddings for each chunk; + # 3. call the AOS to index the chunk with the embeddings; embeddings = sm_utils.create_sagemaker_embeddings_from_js_model(embeddingModelEndpoint, region) - def chunk_generator(content: List[Document], chunk_size: int = 1000): - # iterate documents list and split per document with chunk size - for i in range(0, len(content)): - # TODO, split the document into chunks, will be deprecated and replaced by the ASK model directly - chunks = [content[i].page_content[j:j+chunk_size] for j in range(0, len(content[i].page_content), chunk_size)] - # create a new document for each chunk - for chunk in chunks: - metadata = content[i].metadata - doc = Document(page_content=chunk, metadata=metadata) - yield doc - - generator = chunk_generator(content, ) + def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overlap: int = 30) -> Generator[Document, None, None]: + text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + for document in content: + splits = text_splitter.split_documents([document]) + # list of Document objects + for split in splits: + yield split + + generator = chunk_generator(content, chunk_size=chunk_size) batches = batch_generator(generator, batch_size=10) + # note: typeof(batch)->list[Document], sizeof(batches)=batch_size for batch in batches: if len(batch) == 0: continue logger.info("Adding documents %s to OpenSearch index...", batch) + # TODO, parse the metadata to embed with different index docsearch = OpenSearchVectorSearch( index_name=index_name, embedding_function=embeddings, From 9299aa3eb26334c441245a7f70e805d124dd3a00 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Fri, 3 Nov 2023 03:51:23 +0000 Subject: [PATCH 14/21] fix: fix glue No space left on device issue --- src/etl-stack.ts | 9 ++++++--- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 16522 -> 16515 bytes src/scripts/dep/setup.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 40f25e6f..33b9d6ab 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -50,7 +50,7 @@ export class EtlStack extends NestedStack { }); // Assemble the extra python files list using _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl') and _S3Bucket.s3UrlForObject('nougat_ocr-0.1.17-py3-none-any.whl') and convert to string - const extraPythonFilesList = [_S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl'), _S3Bucket.s3UrlForObject('nougat_ocr-0.1.17-py3-none-any.whl')].join(','); + const extraPythonFilesList = [_S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl')].join(','); // Creata glue job to process files speicified in s3 bucket and prefix const glueJob = new glue.Job(this, 'PythonShellJob', { @@ -62,8 +62,11 @@ export class EtlStack extends NestedStack { // extraPythonFiles: [glue.Code.fromAsset(path.join(__dirname, 'scripts/llm_bot_dep-0.1.0-py3-none-any.whl'))], // extraPythonFiles: [extraPythonFiles], }), + // Worker Type is not supported for Job Command pythonshell and Both workerType and workerCount must be set... + // workerType: glue.WorkerType.G_2X, + // workerCount: 2, maxConcurrentRuns: 200, - maxRetries: 3, + maxRetries: 1, connections: [connection], maxCapacity: 1, defaultArguments: { @@ -73,7 +76,7 @@ export class EtlStack extends NestedStack { '--REGION': props._region, '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--DOC_INDEX_TABLE': 'chatbot-index', - '--additional-python-modules': 'pdfminer.six==20221105,gremlinpython==3.7.0,langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,nougat-ocr==0.1.17,openai==0.28.1', + '--additional-python-modules': 'langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,openai==0.28.1,nougat-ocr==0.1.17,pyOpenSSL==23.3.0', // add multiple extra python files '--extra-py-files': extraPythonFilesList } diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index c655f299af7fde8f8034c6ec8bf46dc2b55c78e4..47ba1d0c008bf739472cc4173c1db4035e7ae328 100644 GIT binary patch delta 5959 zcmV-N7r5w(fdPYo0kC%$4Ut}ASEQ;GZhI2|0HBkE7bJgcbK5wQ-~B64^rTEqB*wCn znVf4iSD84u_|~4JV((0CIVy#sAd720v;=8Kas1z}yYV1EQnusGai$y!XfzuALU#j9 zvQ<%5tmIpXd`pXqizL6$KSe!Tl|{^j*jkiD##T|aOp_T?-5=nm?5Nf&7^d3~leikN z<3zyU_p5&@De@>Cu%n8XQB{-!cFO-<^E~EqY#QYk@iI#Ca9+f9#`9{L7SWuSB3#WE z8t3nk;2#d(9#hY~17mTe-u3v`r+;o7D}X9)keF4d-+Dwuy`DpqMyRn2NnqVjTHT<7mNqDC@K z6@*5hM@;J~NyP)kZ7JD@vq&WI>mpwy7k zk1{@;23uPhucCQWMbnCBt2C-O47k~1@OW{NSI8Go8txr1Zxxl%1@Lg`4J5!4r~|~) zWdeU1Q^w0RK;7J`P9Uo;`7{OXg68vyYL1t6emPC%i14SMJOd2OMXkkV-FV5@*F`xO z_Ha>WSyZkS*zML(^H*i}ZxwH3&?+2_}CW3!0>O4lvVeme$K}a$1aL$svCE0Mb9f#mlqsgikvgxI+z&J3SurJP&LI@;b-%y5`v=G^Sn0_h-o0Exeqg^=~`-))*q&Z zhcp_LMI>lzQ3`bBC~UFLtLO_XC90Sg$We^sXBN@;TG!G7%h8qp{35jKr zSyaVKKnlK|Uj1^%-+me}KYSYONCb^}CJUAql^PS}V0r8}wtK+XHal97K;?ggg1{oZ z*a4_5k1ugJ!mLVul@zrwzy^@j=sY?W<1iMGShS9xDk8;s!iH=#v6iMEX&hrV>hwtY zjV%Ow691h{8_tnr}5=KoybX0_&Mmp8M1tUX(?>@<&17z-c;c zn7zP4-=@M)U|QR%>ij-GAWDBG{m|#b(d3*+*Z>e>J945Xoaj{ru4z_R+}a#W;Gs8_GxP@d8)ihBG|LqQB%OI^#xE&A z6U^7~^EopQLR^{E2 zTvQMP*z|p@!Avr~VSHRbGzENIL>PMlf&2@w5uIEeV>Ypx8HGZ-iMTfo@<7x8AzR>wPnsB^STKBS=20!a)SZXnb!7sm8@E5 zKZox4pmAxU25W`A)bQJI+HiQR{tX)GNVHwqU>X>(;cqA;M*wf6)J4B?z^auCqJFXB zak5B2i*3^|u^4~)mZ%DwHMmh6K9D#&*L|xI3Jo zG7VvE*$Vmrw8$W&Y9@;+-IsSiXby1Oja@$F^s&y3A9j4 z_R6ST5Jc5=~CDx*1-s3w9USa%v&csr49cM zR#7CU;+~S?Slb;AED-_!WoFXnvj^7NHIl{yww>dE$S3kJ@!81kJDJ>ZAdQzFQ}HA; z4G!2gLrWz5YTck#;Ap5tY5V)vAN9&ycaLb*Zr6Cptrq@|Tx#gLs7q;%7*&@ly|W~z zxha1W{*8;lJ+Xah24ox*J|;JI7H#Q@x@<8Bzzt?S}A zlhtmRM;CT!4iLizxTe&LfNp@(;*87;G35TRKB-l5D$DT@`$LvX3M&iEsaoif`VU0>m+G zW$ZI_eU|ZelJ{=%ddqqf#Y<++W59nD!MR{J)wRpw8pj|>fRONo6r4kc^gbr8m-A$9 z4#6;fodYTy0L-m(omRXj2MN5R0&T-IN~;Ypme&T%bdre7S0+AmxK(C=!q= zVNI-uh0R26gk!|`%040jgwQHoAJtINp(+l+kPt=vk?JsyT~8xpWTyVr^`8&L*Q-8% zEg`~&P!K7X#oz{v<{>d@co(6iWbeSSXLweK9v*}oky9}HK!1Kfu{wVuQ3s6U2IGUu zU5?5PI`Z$Ht94{LFe@AMu_v(o3x?8k;;*V`fCGOWxHk#zOs(<0LMw1UN=i}_Jf3j$ z##b{8*v~%`t2EV3Ohmk*p}UQzlV@s55Y3TVGgIp{Xd9vha1szrBoHJdw_ZEu(%pa( z0N`&Mq?728YrdgOjL*V5m1|do?%Z^#zuh6?7a%~&w2V1-3&$>vTI5gBr#n5#!gN0A z+7^I!*R##uCMCpis4SFc%E~dt(1K%9P!(lo)KcrK_pM89HlKf*ksCmymFv8*4+3@g za=@;TGXNbXkkAOf=M>W-2N-h6p7x^a&;_vGa08%C9l%{}z}W)xKIJY9oqpaakKOKa zV8{r&TF`f{yPt!dC2t11UmW;+mq;w~?h#9e&bvg@ruGi;^q)~bY9SAtv0%AXuww)9-=EX>;4v2l z;Jpjbx04fVdiD6bbof7TXiYvL>*ctNN1i7zk^?36^LBr++sQ?mAW64$6it>yu?*^N z?|7T;8n3O4J+BGv#sp+yI*j>s?A9wMU7WT%SM!UOU>cmLnYYv z%mDuKE*F2*mG@?9Iv1h{Pp>`q9P2GSsZRPz$c@X<+va?)MdDrM!~bZWerK=^BjYFq z<(k!0n`EoHTor;tPl9=qyjF}+jW?ZUQIb!m-T`#tqxUycJ;I#=y}>qwrpG)gMEKT@Y;@ z9z3dZpM5AH#>{$-jc1(jKQ-uJ2jO79)KKkJe@IrGu4YQg zhaX9LyQ5tVDH)q4UnPOBW(i$DbHMjTdm4ZIFRZT43(zO(CYe(rL>-5PJYfB!sQ#Nq z?vM8M$bV3s{V-W9INk)r@lq*U@+E9%k+`(@M!24jjBcEQE$OQLALib5pVlKXBQ18C zeC34e#YpK7`xP&D$7wC}G{20L?!aE1B^Aw2Vj2B9QYrx$X%Vs1=+Vp3OZZ1xq+@@u z=BtMB9Umt6!Il~)4;`x#>R|WFg zk>U*^b2v5!ring zn$eu-Zq|G%c+=S(OBtC9*Ymlc+T03lH>w%((XuLv^aqMX`82K^ z!Sq7U?3>z%n}H52#_ChB7rVPH#_FE^T}oojdJLX{JcOga6WdRHr@+5o%7HZ*deY`2 z$EF@+f#CB%ML5^5lc0rQd*kK379bKNujK2`+C-ZeeB_-7>bE2~!!Lh2ED4&of$Bou zIGcjJ)quA`+q2BnmqB!u&*)v4h%|}$2F&eMs}F`|&9^G{6dRsN{1leyb6E2?5&hlS|TAl%`YNt@4t)o&g3 zihYzSjtb*6!FYB@?Csf$dKJEg_~*GF1m_1YCk_$|-URSyw(3&RG!|EM znrIv^ec^F=O=pvK7(A=vOI{JPVPXkzz9UbIQ%BMpIeAc^PER9<5i&)%09mt(actU&U&fx-UT%=925<`MIi7)GmPSb^#d#r_b$ zhA&<&Up&w1f05F`)c*&wAuF5>5i1>IR}!W+P<{Xa0NMZm03?$yIvY#oS0Sq9QzrCD*b}C%?Em9<)yT ze!#0LL~8EOvtRt|9`v@%Byi25eTiiY33C~tHY!5MAwm0&$UTy%0Dk0xrfD2l5oHT% zOmVb+te(fRRNGLY26pZTP)iE~1QY-O00;mp9b=P*IhX@09b=OrI#LNM9b;FW;DGW! zlYBZZf6qyhqCfyZ(S4l-EarkmFOacNR6s@9WH6)*CL~HC!s)v=)uC;N7kKeMLN!P* z0U2yR0Vtc205EGusw-knlzs_cy5Y5zY|n$32HWpCq(w)Ay$AhsEw0?w2_uUyg5KOw z%m;eFV%&kAl&E{qGHcR00lHm}Mu^+Fn(IDqe`xJ${g%Wd?RRsJXsT9j7g7~c7$u&+%iJKQV z-Y-HVhCQC>i@d3m=mNtoVrgDm-w$eYX9cNp=&@MAoZ=PsxfbC#gVz@`Xq-3@_g*|n ze}ZyeHlel8qPVLpmdm^K!j?zA9W9C8xC6yXGV_df^$R#0!CC>5?_%F!Jd&HaJt~IT zny$Xj1NEFLu)=~C`-4aE%-26&i%LTNlX?A4jzg0K%cStqv!v}#Q66BWu!}oi12MER z-(0wdZoVHVNn)fYwQo9d?NicU;RD&2e_+EU5?eJ{B}Z{tUuB`zT;_19*%7*IPWaO`wyLgf)xWvt-Y^#SG)A#jPxfVD?g+-0`|)(HF-EJ?N6kDs4T@-6jLW;GJ zuFE>Xl)}Ar32p&`H;88BPJaAkD31mNWqwhsA!7k*MyHc=eI%=uChcLqo18x<%ZWZy z?0ergw6(8O)W>hL={zn04Ut}ASEQ;GZhI2|0HCuZJ`@2BD;;B35~em#egFUf+LJ{; zMFT4xW0Q?P8v=7VlQ23SlgK|211lY4liWWW0uDNpFghzMP)h*<6ay3h000O8D;;B3 po#24-KLP*%(gOehA^-pY0000000000v;kw2Pe3&W;yeHV001nhHv9kp delta 6032 zcmY*dbxhoCvt3*lch}-poC3ubDY{4(DPD?|7m7>ahr7GGLyHtEE`=7CrPyMN(*ng^ zZr_`H_vV}AWRl5wo}5X}%pVgHgc2QuQtw4c!meor`^}d1){hhfa+L>xXhA80vh+an ziw8G^SYyxcy56br%=c_VjVWRkk{6mZ90CEh;#mvx6LBR_PMk*Xorrj*L(cf8D}?Ds zA|@`xhr5M%MU!JY1 zcOWfEu?f*1)K9t$T~8&EbHA~Onm2F)ab&#v)MW)KMkSDX>&o3Ag6LHO$3lS1GQP@? zE^^-uYhHQxjaoaLRls>)-MG`7`*I1CK=hU2$rZoZxFzx1_JzDVee%1SrPO$ z*Jmn~Ja*~ZsAXPs>DvDis|d^bl1fQQXMP$hye>sWHG^x#&!QtT2t!uHRFMQ1FPwFX zlSJAxm477^<``DyE}LU4504O`Nril~vxlO(+i<>b7#`Wf<7Ma0qO_q+H`vE#I%TI# zpxzUm2rI82!;@iWbposvF*QT_R^;xARP+j5Ctllpr6{qT{9IQzpik@#MUl5Odn3*2 z!Jz7!Kb;9146DR2I19ch<_(@shXkp*&huvhN*6gX1vIAlakj;vd0_ z_UIISMrLWht7}hIGQ~f7PHvC{hIf=GV|nDi{-S#sfnF4AO$&_4_DL|s5j(U$_?07I z@UNA=N3@)z4~`;L1E@O&&_Y5vDs+kYUeaG$R_h!h^ZhvTVkb+43c3JrrPqRIhG_|v z+>iy!pRE_#w~f0LBpN=;Qr`ZEq1p$B*O(vNzR*V7iO>!)jVctI^d4^P)VD6i=S_I% zzf_JrtS@250ra&vv3X}=4nvksZ`cV`GClh0HabZ#7eI};c`Bn;bVH~lF>@z+kWz_+ z#XvB}+{`&^ob_1vCkC#bhA<0Y19b8&@1J9O_~FZUQhf+)=)#BBCuU$wePT8K`g}=b zmi7t~k_2-EsL%;=%Ll?233<;ox$!Z|o2#lP{+tm5(C7Q%vW)l~yGs2Ofkd0(?~%^5 zW%_Rp$(NMJj#6QiP@||B${ps;TdNJ$EX9b{m%iVSexqHy*(ZQCX6r44(p^;o* zLsu1uoFq6b7~Zfzpsvat=*}T5(TB9*$Q?>xJ8$cUSPk|B`elqpb)O14FJeE#ILPcS&!WeEsWMNb1Q)xo4$^?l(%Myb3nl^g1Dq>BTO zHKALBs(wxBhG||g!cYZ#3;0z7s<7Hk`pFqxG1vO$^Zo@=>@D-axV^1v$sLkWT$bpm z_Y|*nFz#&ZS-1zj5u-i3Tukuq+RFwf3$zW&jah%??Da!$+lXx1lER!&Nk&Er&H{d9 zH<4DGw;*!f(Vw&38>6{(>Jc9%Dp4ihd)@I}?f9!o?0tP#nH@AsZA`mE_q;A{-P~tz z2l`GhPzt^zTesBpp5eK)=9 z3(L@2YMNkMignJZ<+l_E{Q7=+NZ~#QpF_t+oEe} zNKm}fB9o0DQgE4eOeYJozQrB<0Lrc)(w*zrm5!>&ULmsfah0u0z^k@lI0u-auFuf0 z>+%x<%C1}U)0SNC@!Zs1guBG|om%;Z1q;hR^M*pwnwGw_L+N4{CK1$pkhcEd+ zn-D@a3H8H6s#9ZbZo43Te0*)6eXAEsz(b zE6UpwL!{)qO>&g={oXh9!}4g+5yGa!PUi>~p|1mxmH`Qi#mW<>i#<1;WLH@ZAuK<} zuv5}vA^5D$E~wzy*Q~=HmLlPtZo4=73M(PVGc3aUO!f zR8C)i~ zwhWlZiA#PdlzF9Zi2&j@{XHH+ZMKW>MHkmfG_`nfi^*P-qjVP-?v9iKZ8V3m zjWHh{ccBsmdKr6iSaH?}nS{5MhQpTSK}Kh4NzWcJE(wB6uvFXE6GJCc-u&?W$x)Db z!qv-YtYR#gky~p`t1|+1skXfdyBhxfu!ALF*S93srx-Ya1MJCGs&VK~?N4vM&$?^x zm$cFjou5~?b4^?`b#W{Fr2DES({@VySaE3xZlrLlU1msLT%YtVLxF6~v* zMipq{vCOBqo^_BzWbi$4i-0<&B@sSo3yCVehy2;%d(41;aiE z#>JWZ+#9-Ow}Y@b?=i;G{)v%IU#*Fq<{919kLZ<7CSjlVM=4u>6Fel0-);;a+d1-Vb2^a zHqF`FsImrhz<;nrMHsA=Gxt3yr5)t_1H9MHp$0p@mKzwJkGk=ecgK-f2GQ_Q7tv=` zH=4Zl(NS2P_!hU_9+WYvN*B5@x5JMRimiw40jFxE3UB+78#duw%r8=#Ww_Ou9pC(D zaW@MhQN3ZJjz%wiWk~wl(o!ag7D*-4-Ix#wVs3ghuOh1NUKy<@$fg7ZN~OlE=k-RZ zvZ{dQhh4=JkLpQAp2&J;GNln4p*GACA5{&SRfrwe;tQ7pu6*+12rOomEwVA$sXRs< zpf+B07G<+-=(^A~qr~+g{T{LFz?XG7K5}b%fb;$-xJV@Ls)n0pz1N)eilHyBN_6$L z9Xf%kn!H2schdOPYlm>zyAu_wwDR`&s7RQId)=k`sfH#_v8-;5twS#6pNU1--6xljg; zx#zldui!Atp5SmopdMfPdM+T^0woC2!cH}eB+IFcT{Lh(o}`kR>LPjYq44} z7FGnGe6igyZ$^MmZ3RQSo$&B-t8d;qw77~vEBHea1zDA`vY+d~?7zly!pi_I+w4ntv5sI}P>Ol0x2CvlC#4-H$|J;A)VI#p)N7vE zv~BF|MWNaA(WO#8@coON(pnY6Mx1;3b!1qvu@Rr+}~)TKdnNUJ0z`Yd>XUjWn^@{ z?>PLS8X;hc;+=zvx*l`;nrjN~gJ^2FZqX`A&ul^wDT})$4mUk})I#_rT4)#-nYR>o znm7v>Nw#7Wo@B+S8wsf}Y!P|)--cpBOHw~Tl)jVck3je%`U!B|a1C#vWW_+yj zY0v8;h|N{O;he86USczW4N6hc`C5(TY&BcOI&|)f#BIEnx=@~qN?2XfxWM+pJ;!`O zl+AEfnFteflNR8$3K(33ih0Lr{feo&7^a1OCpIgPiEoU-B?Tr)iL@1F52psq#?i@r z8n}X0Jms4_Qqv&am7I0ohnS_S-5AR?_kJBQ`h19ryF0F&K)Wmoq!Nhxg3Vb{MX(O- znq0)md$X+;(9qYid(7#&U{n+@OgPQ{C@?7~uLti5R2lH!nuj(dI?8Vb?i8%s7fH&> z3gL5yHcbI_@zhj8+2r%fa6cN|=VDPkuTag$t=a>30{;+R0ixH8hNh-lh1#>4&nuK7r3l=7 zdT&k49tq>5sTlT>o=TbCF@BS&_gnnK8Iq8|(Vb z|4HSovhjh);|R`tOwkPErLfsiK1bRV{9Z)A9nH^!CPU<9l+E>-P8GUEDP*innb4fT zy#M743qT(tx+W6)=UT9j(AObX+MO4#Ubu6V+;py)85f@{6W46A1MPEq>mGL3*fdOV znoXemIa7uR{O8sV_ty9p*YkynN)Km}N@R6za8RM1mVGg}Nqaygz!sU$sf$BG_-%o- zNQ;u)jQoeAO0;<7tb9zj3$79yo`>}Zl{HNIR^TY{h~VUQ`k%seGGTUa&Oc&QWEB1o zcg_0fN~ypVAL&0^$A8xI=#ITtw?}QH6N3DsryN~;ytUs4Vd$jGcx0t~`&ojI{>C^QB%3=6U`JSb1Te?0i_WO5m5X zTL3g_Q}-Rzo0p&hT`}7*p5v#dLHjT4yCSHj@H(G~y@;QLv`4wZ7^qY|VklatY>VCr z`W17}W7|%Y|HzrKWmGl|$*iB%_TF`yej|o*cmJO4j0s;#OU&fkP zb^MXJnNK+%-WcxS;Lz5Rkx@zy$_r|*0iEpblTns_)<8N&0{M6HF0#Io9}*3ZpK+?7 z8Bm6myM2Iggbr2Bz-Tk(-B;}*TD~|0469p-Y4D;2#I#J!yGQbf05mR>AxCok{upH-)o)9%2k zH9HyVhLs+Z;9Hy4Nma!{desdhku+G!;Fl|mp(R5UZ1u!@CyV+XF&5zm^0UZ6WBg(d zX-@eI%nb{WMF^z;Od!AFXOG&x#jhvSFU&l-V3JKeD;U_5I5rdn0-gQ^8z=?(@)=+{ z`GTZh5FawVTJr;QOD&LFl#h(;yq}^W;yQ$laNy_z1bI6Cb-W|7SOwBXs?O_^N8l+0 z(`eGV%U_3*NgJH)F7kVdiTk$xU~`Et?)ujGHBcx)huenvEoEVV%10|Q6c_?Kn3nw` zaYC#uqZ$Pos_me!rRY>0DZ+ zr-(|boiNu6=1@-k2_w4nn4>%fvl|bLvq%L;?4Fxs=;`43d)czx8y2#y+GVXvcyMY( zPaNky8?O76x7rvi)&_)p*?cDN`g{Y+&w$W0-^}uPsWcDG-Cf4&nva^jspn^}dMl+k z;1fjR#Q?&OmpJ}gT(JZ{&=RgXb_!o5EEXOzZXVyzq+F~LrL-tn<_(9XH=YL*tk6p2 z^_ro*T1WP7e>7hF={v`pvyksotMH&}S3j7Xgpo2CxqK?>%2Zi&{xYj|@?PY^r)}J^ zr;StjO}Rm2v#;seAi<9)H-dw>E796`(h|60v@eAgJQ65?n!Io;oSqyv>pDrd%nFeGq^HoAcIY~bTP#+MCgeiZP39k##F%aWmh|7%!MWM4v< z{;d?12_3!pYtw(h{1^X;rXU~5e{CrUQ-we@bktE$$aLV Date: Sat, 4 Nov 2023 18:36:54 +0000 Subject: [PATCH 15/21] chore: reorganize loader utils. --- src/panel/app.py | 2 + src/scripts/dep/llm_bot_dep/loader_utils.py | 309 ------------ .../dep/llm_bot_dep/loaders/__init__.py | 0 src/scripts/dep/llm_bot_dep/loaders/auto.py | 22 + src/scripts/dep/llm_bot_dep/loaders/csv.py | 172 +++++++ src/scripts/dep/llm_bot_dep/loaders/html.py | 42 ++ src/scripts/dep/llm_bot_dep/loaders/image.py | 5 + src/scripts/dep/llm_bot_dep/loaders/pdf.py | 458 ++++++++++++++++++ src/scripts/dep/llm_bot_dep/loaders/text.py | 12 + src/scripts/dep/setup.py | 2 +- src/scripts/glue-job-script.py | 325 +------------ 11 files changed, 722 insertions(+), 627 deletions(-) create mode 100644 src/scripts/dep/llm_bot_dep/loaders/__init__.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/auto.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/csv.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/html.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/image.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/pdf.py create mode 100644 src/scripts/dep/llm_bot_dep/loaders/text.py diff --git a/src/panel/app.py b/src/panel/app.py index 4834a7b6..a0cf9e3d 100644 --- a/src/panel/app.py +++ b/src/panel/app.py @@ -117,6 +117,8 @@ def pipeline_tab(): request_body = { 'aos_index': 'chatbot-index', + 'operation': 'match_all', + 'body': {}, 'query': { 'operation': 'match_all', 'match_all': {} diff --git a/src/scripts/dep/llm_bot_dep/loader_utils.py b/src/scripts/dep/llm_bot_dep/loader_utils.py index 4fb6641f..d956ec19 100644 --- a/src/scripts/dep/llm_bot_dep/loader_utils.py +++ b/src/scripts/dep/llm_bot_dep/loader_utils.py @@ -4,324 +4,15 @@ from pathlib import Path from typing import Dict, List, Optional, Iterator, Sequence from langchain.document_loaders.pdf import BasePDFLoader -from langchain.document_loaders.csv_loader import CSVLoader from langchain.docstore.document import Document import csv from io import TextIOWrapper -from langchain.document_loaders.helpers import detect_file_encodings # from langchain.text_splitter import MarkdownHeaderTextSplitter # from splitter_utils import MarkdownHeaderTextSplitter logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -metadata_template = { - "content_type": "paragraph", - "heading_hierarchy": {}, - "figure_list": [], - "chunk_id": "$$", - "file_path": "", - "keywords": [], - "summary": "", -} - -class NestedDict(dict): - def __missing__(self, key): - self[key] = NestedDict() - return self[key] - -# TODO, this function is duplicated in splitter_utils.py, need to merge to one place -def extract_headings(md_content): - """Extract headings hierarchically from Markdown content. - Consider alternate syntax that "any number of == characters for heading level 1 or -- characters for heading level 2." - See https://www.markdownguide.org/basic-syntax/ - Args: - md_content (str): Markdown content. - Returns: - NestedDict: A nested dictionary containing the headings. Sample output: - { - 'Title 1': { - 'Subtitle 1.1': {}, - 'Subtitle 1.2': {} - }, - 'Title 2': { - 'Subtitle 2.1': {} - } - } - """ - headings = NestedDict() - current_heads = [headings] - lines = md_content.strip().split('\n') - - for i, line in enumerate(lines): - match = re.match(r'(#+) (.+)', line) - if not match and i > 0: # If the line is not a heading, check if the previous line is a heading using alternate syntax - if re.match(r'=+', lines[i - 1]): - level = 1 - title = lines[i - 2] - elif re.match(r'-+', lines[i - 1]): - level = 2 - title = lines[i - 2] - else: - continue - elif match: - level = len(match.group(1)) - title = match.group(2) - else: - continue - - current_heads = current_heads[:level] - current_heads[-1][title] - current_heads.append(current_heads[-1][title]) - - return headings - -class NougatPDFLoader(BasePDFLoader): - """A PDF loader class for converting PDF files to MMD. - - This class leverages the `nougat` library to perform the conversion from PDF to HTML. - It inherits from `BasePDFLoader` and extends its functionality to utilize the `nougat` library. - TODO, the load_and_split method need to be implemented and default is RecursiveCharacterTextSplitter - Attributes: - file_path (str): The path to the PDF file to be loaded. - headers (Optional[Dict]): Optional headers to be used when loading the PDF. - - Raises: - ImportError: If the `nougat` library is not installed. - RuntimeError: If the `nougat` command fails to execute successfully. - """ - - def __init__(self, file_path: str, *, headers: Optional[Dict] = None): - """Initialize with a file path.""" - try: - import nougat - except ImportError: - raise ImportError( - "Please install nougat to use NougatPDFLoader. " - "You can install it with `pip install nougat`." - ) - - super().__init__(file_path, headers=headers) - - def nougat(self, file_path: Path) -> str: - """Executes the `nougat` command to convert the specified PDF file to Markdown format. - - Args: - file_path (Path): The path to the PDF file to be converted. - - Returns: - str: The Markdown content resulting from the `nougat` conversion. - """ - # nougat ./paperSnapshot.pdf --full-precision --markdown -m 0.1.0-base -o tmp --recompute - cli_command = ["nougat", str(file_path), "full-precision", "--markdown", "-m", "0.1.0-base", "-o", "tmp", "--recompute"] - - try: - result = subprocess.run( - cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - result.check_returncode() - return result.stdout - - except subprocess.CalledProcessError as e: - logging.error( - f"Nougat command failed with return code {e.returncode}: {e.stderr}" - ) - raise RuntimeError("Nougat command failed.") from e - - def load(self) -> List[Document]: - """Loads and processes the specified PDF file, converting it to a list of Document objects. - - Returns: - List[Document]: A list containing a single Document object with the processed content. - """ - return list(self.lazy_load()) - - def lazy_load(self) -> Iterator[Document]: - """Lazy load and process the specified PDF file, yielding Document objects. - - This method reads the PDF file, processes it using the `nougat` command, - reads the resulting Markdown content, and yields a Document object with the content. - """ - try: - file_path = self.file_path - # Call the method to run the Nougat OCR command - self.nougat(file_path) - - # Rest of your code for reading and processing the output - file_path = Path(file_path) - output_path = Path("tmp") / f"{file_path.stem}.mmd" - with output_path.open("r") as f: - content = f.read() - # consider math expressions are enclosed in \( and \) in Markdown - content = ( - content.replace(r"\(", "$") - .replace(r"\)", "$") - .replace(r"\[", "$$") - .replace(r"\]", "$$") - ) - logging.info("content: %s", content) - # extract headings hierarchically - headings = extract_headings(content) - - # assemble metadata from template - metadata = metadata_template - metadata["content_type"] = "paragraph" - metadata["heading_hierarchy"] = headings - metadata["chunk_id"] = "$$" - metadata["file_path"] = str(file_path) - # TODO, use PyMuPDF to detect image and figure list, but no link to the image for the extracted text - # metadata["figure_list"] = [] - - yield Document(page_content=content, metadata=metadata) - - except Exception as e: - logging.error(f"An error occurred while processing the PDF: {str(e)}") - -class CustomCSVLoader(CSVLoader): - """Load a `CSV` file into a list of Documents. - - Each document represents one row of the CSV file. The rows are converted into markdown format based on row_count. - - Output Example: - when row_count = 1, - page_document_1 contains: - |index|name| - |-|-| - |1|Demo1| - page_document_2 contains: - |index|name| - |-|-| - |2|Demo2| - - when row_count = 3, - page_document_1 contains: - |index|name| - |-|-| - |1|Demo1| - |2|Demo2| - |3|Demo3| - page_document_2 contains: - |index|name| - |-|-| - |4|Demo4| - |5|Demo5| - |6|Demo6| - """ - - def __init__( - self, - file_path: str, - source_column: Optional[str] = None, - metadata_columns: Sequence[str] = (), - csv_args: Optional[Dict] = None, - encoding: Optional[str] = None, - autodetect_encoding: bool = False, - row_count: int = 1 - ): - """ - - Args: - file_path: The path to the CSV file. - source_column: The name of the column in the CSV file to use as the source. - Optional. Defaults to None. - metadata_columns: A sequence of column names to use as metadata. Optional. - csv_args: A dictionary of arguments to pass to the csv.DictReader. - Optional. Defaults to None. - encoding: The encoding of the CSV file. Optional. Defaults to None. - autodetect_encoding: Whether to try to autodetect the file encoding. - row_count: How many row in a page document. - """ - self.row_number = row_count - super().__init__(file_path, source_column, metadata_columns, - csv_args, encoding, autodetect_encoding) - - def __read_file(self, csvfile: TextIOWrapper) -> List[Document]: - docs = [] - - csv_reader = csv.DictReader(csvfile, **self.csv_args) - counter = 0 - for i, row in enumerate(csv_reader): - # print(f"i: {i}") - # print(f"row: {row}") - try: - source = ( - row[self.source_column] - if self.source_column is not None - else self.file_path - ) - except KeyError: - raise ValueError( - f"Source column '{self.source_column}' not found in CSV file." - ) - counter += 1 - - if counter % self.row_number == 1: - # First row with header and separator - header = "|" - md_separator = "|" - row_content = "|" - for k, v in row.items(): - header += k + "|" - md_separator += "-|" - row_content += v + "|" - row_content += "\n" - elif counter % self.row_number == 0: - if 1 == self.row_number: - header = "|" - md_separator = "|" - row_content = "|" - for k, v in row.items(): - header += k + "|" - md_separator += "-|" - row_content += v + "|" - else: - for k, v in row.items(): - row_content += v + "|" - content = header + "\n" + md_separator + "\n" + row_content - print(f"markdown content: {content}") - - metadata = {"source": source, "row": i} - for col in self.metadata_columns: - try: - metadata[col] = row[col] - except KeyError: - raise ValueError( - f"Metadata column '{col}' not found in CSV file.") - doc = Document(page_content=content, metadata=metadata) - docs.append(doc) - counter = 0 - else: - for k, v in row.items(): - row_content += v + "|" - row_content += "\n" - - return docs - - def load(self) -> List[Document]: - """Load data into document objects.""" - - docs = [] - try: - with open(self.file_path, newline="", encoding=self.encoding) as csvfile: - docs = self.__read_file(csvfile) - except UnicodeDecodeError as e: - if self.autodetect_encoding: - detected_encodings = detect_file_encodings(self.file_path) - for encoding in detected_encodings: - try: - with open( - self.file_path, newline="", encoding=encoding.encoding - ) as csvfile: - docs = self.__read_file(csvfile) - break - except UnicodeDecodeError: - continue - else: - raise RuntimeError(f"Error loading {self.file_path}") from e - except Exception as e: - raise RuntimeError(f"Error loading {self.file_path}") from e - - return docs # local debugging purpose # if __name__ == "__main__": diff --git a/src/scripts/dep/llm_bot_dep/loaders/__init__.py b/src/scripts/dep/llm_bot_dep/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scripts/dep/llm_bot_dep/loaders/auto.py b/src/scripts/dep/llm_bot_dep/loaders/auto.py new file mode 100644 index 00000000..60eca5e9 --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/auto.py @@ -0,0 +1,22 @@ + + +from .text import process_text +from .csv import process_csv +from .html import process_html +from .pdf import process_pdf +from .image import process_image + +def cb_process_object(s3, file_type: str, file_content, **kwargs): + res = None + if file_type == 'text': + process_text(s3, file_content, **kwargs) + elif file_type == 'csv': + res = process_csv(s3, file_content, **kwargs) + elif file_type == 'html': + process_html(s3, file_content, **kwargs) + elif file_type == 'pdf': + # res = post_process_pdf(process_pdf(file_content, **kwargs)) + res = process_pdf(s3, file_content, **kwargs) + elif file_type == 'image': + process_image(s3, file_content, **kwargs) + return res \ No newline at end of file diff --git a/src/scripts/dep/llm_bot_dep/loaders/csv.py b/src/scripts/dep/llm_bot_dep/loaders/csv.py new file mode 100644 index 00000000..8df30dd3 --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/csv.py @@ -0,0 +1,172 @@ +import uuid +from datetime import datetime +from typing import Dict, List, Optional, Iterator, Sequence +from io import TextIOWrapper + +import csv + +from langchain.docstore.document import Document +from langchain.document_loaders.csv_loader import CSVLoader +from langchain.document_loaders.helpers import detect_file_encodings + +class CustomCSVLoader(CSVLoader): + """Load a `CSV` file into a list of Documents. + + Each document represents one row of the CSV file. The rows are converted into markdown format based on row_count. + + Output Example: + when row_count = 1, + page_document_1 contains: + |index|name| + |-|-| + |1|Demo1| + page_document_2 contains: + |index|name| + |-|-| + |2|Demo2| + + when row_count = 3, + page_document_1 contains: + |index|name| + |-|-| + |1|Demo1| + |2|Demo2| + |3|Demo3| + page_document_2 contains: + |index|name| + |-|-| + |4|Demo4| + |5|Demo5| + |6|Demo6| + """ + + def __init__( + self, + file_path: str, + source_column: Optional[str] = None, + metadata_columns: Sequence[str] = (), + csv_args: Optional[Dict] = None, + encoding: Optional[str] = None, + autodetect_encoding: bool = False, + row_count: int = 1 + ): + """ + + Args: + file_path: The path to the CSV file. + source_column: The name of the column in the CSV file to use as the source. + Optional. Defaults to None. + metadata_columns: A sequence of column names to use as metadata. Optional. + csv_args: A dictionary of arguments to pass to the csv.DictReader. + Optional. Defaults to None. + encoding: The encoding of the CSV file. Optional. Defaults to None. + autodetect_encoding: Whether to try to autodetect the file encoding. + row_count: How many row in a page document. + """ + self.row_number = row_count + super().__init__(file_path, source_column, metadata_columns, + csv_args, encoding, autodetect_encoding) + + def __read_file(self, csvfile: TextIOWrapper) -> List[Document]: + docs = [] + + csv_reader = csv.DictReader(csvfile, **self.csv_args) + counter = 0 + for i, row in enumerate(csv_reader): + # print(f"i: {i}") + # print(f"row: {row}") + try: + source = ( + row[self.source_column] + if self.source_column is not None + else self.file_path + ) + except KeyError: + raise ValueError( + f"Source column '{self.source_column}' not found in CSV file." + ) + counter += 1 + + if counter % self.row_number == 1: + # First row with header and separator + header = "|" + md_separator = "|" + row_content = "|" + for k, v in row.items(): + header += k + "|" + md_separator += "-|" + row_content += v + "|" + row_content += "\n" + elif counter % self.row_number == 0: + if 1 == self.row_number: + header = "|" + md_separator = "|" + row_content = "|" + for k, v in row.items(): + header += k + "|" + md_separator += "-|" + row_content += v + "|" + else: + for k, v in row.items(): + row_content += v + "|" + content = header + "\n" + md_separator + "\n" + row_content + print(f"markdown content: {content}") + + metadata = {"source": source, "row": i} + for col in self.metadata_columns: + try: + metadata[col] = row[col] + except KeyError: + raise ValueError( + f"Metadata column '{col}' not found in CSV file.") + doc = Document(page_content=content, metadata=metadata) + docs.append(doc) + counter = 0 + else: + for k, v in row.items(): + row_content += v + "|" + row_content += "\n" + + return docs + + def load(self) -> List[Document]: + """Load data into document objects.""" + + docs = [] + try: + with open(self.file_path, newline="", encoding=self.encoding) as csvfile: + docs = self.__read_file(csvfile) + except UnicodeDecodeError as e: + if self.autodetect_encoding: + detected_encodings = detect_file_encodings(self.file_path) + for encoding in detected_encodings: + try: + with open( + self.file_path, newline="", encoding=encoding.encoding + ) as csvfile: + docs = self.__read_file(csvfile) + break + except UnicodeDecodeError: + continue + else: + raise RuntimeError(f"Error loading {self.file_path}") from e + except Exception as e: + raise RuntimeError(f"Error loading {self.file_path}") from e + + return docs + +def process_csv(s3, csv_content: str, **kwargs): + now = datetime.now() + timestamp_str = now.strftime("%Y%m%d%H%M%S") + random_uuid = str(uuid.uuid4())[:8] + bucket_name = kwargs['bucket'] + key = kwargs['key'] + row_count = kwargs['csv_row_count'] + local_path = f'/tmp/csv-{timestamp_str}-{random_uuid}.csv' + + s3.download_file(bucket_name, key, local_path) + loader = CustomCSVLoader(file_path=local_path, row_count=row_count) + data = loader.load() + + return data + diff --git a/src/scripts/dep/llm_bot_dep/loaders/html.py b/src/scripts/dep/llm_bot_dep/loaders/html.py new file mode 100644 index 00000000..55a4ce6d --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/html.py @@ -0,0 +1,42 @@ +import re + +def process_html(htmlstr: str): + logger.info("Processing HTML file...") + # filter out DOCTYPE + htmlstr = ' '.join(htmlstr.split()) + re_doctype = re.compile(r'', re.S) + s = re_doctype.sub('', htmlstr) + + # filter out CDATA + re_cdata = re.compile('//]*//\]\]>', re.I) + s = re_cdata.sub('', s) + + # filter out Script + re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) + s = re_script.sub('', s) + + # filter out style + re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) + s = re_style.sub('', s) + + # transfor br to \n + re_br = re.compile('') + s = re_br.sub('', s) + + # filter out HTML tags + re_h = re.compile('<\?[\w+[^>]*>') + s = re_h.sub('', s) + + # filter out HTML comments + re_comment = re.compile('') + s = re_comment.sub('', s) + + # remove extra blank lines + blank_line = re.compile('\n+') + s = blank_line.sub('', s) + + # remove hyperlinks + http_link = re.compile(r'(http://.+html)') + s = http_link.sub('', s) + + return s diff --git a/src/scripts/dep/llm_bot_dep/loaders/image.py b/src/scripts/dep/llm_bot_dep/loaders/image.py new file mode 100644 index 00000000..2eab80de --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/image.py @@ -0,0 +1,5 @@ + + +def process_image(image: bytes): + # TODO: Implement image processing with ASK API + pass diff --git a/src/scripts/dep/llm_bot_dep/loaders/pdf.py b/src/scripts/dep/llm_bot_dep/loaders/pdf.py new file mode 100644 index 00000000..c280afcb --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/pdf.py @@ -0,0 +1,458 @@ +import os +import re +import json +from bs4 import BeautifulSoup +import subprocess +from pathlib import Path +from typing import List, Dict, List, Optional, Iterator, Sequence + +from langchain.docstore.document import Document +from langchain.document_loaders import PDFMinerPDFasHTMLLoader + +from langchain.document_loaders.pdf import BasePDFLoader +from langchain.text_splitter import MarkdownHeaderTextSplitter + +metadata_template = { + "content_type": "paragraph", + "heading_hierarchy": {}, + "figure_list": [], + "chunk_id": "$$", + "file_path": "", + "keywords": [], + "summary": "", +} + +class NestedDict(dict): + def __missing__(self, key): + self[key] = NestedDict() + return self[key] + + +# rewrite this class to use the new TextSplitter for mmd type +class MarkdownHeaderTextSplitter: + # Place holder for now without parameters + def __init__(self) -> None: + pass + + def split_text(self, text: Document) -> List[Document]: + lines = text.page_content.strip().split('\n') + chunks = [] + current_chunk_content = [] + table_content = [] + inside_table = False + chunk_id = 1 # Initializing chunk_id + + for line in lines: + # Replace escaped characters for table markers + line = line.replace(r"\begin{table}", "\\begin{table}").replace(r"\end{table}", "\\end{table}") + if line.strip() == "\\begin{table}": + inside_table = True + continue # Skip this line + elif line.strip() == "\\end{table}": + inside_table = False + # Save table content as a separate document + if table_content: + metadata = text.metadata.copy() + metadata['content_type'] = 'table' + metadata['chunk_id'] = f"${chunk_id}" + chunks.append(Document(page_content='\n'.join(table_content), metadata=metadata)) + table_content = [] # Reset for the next table + continue # Skip this line + + if inside_table: + table_content.append(line) + elif line.startswith(('## ', ' ### ')): # Assuming these denote headings + # Save the current chunk if it exists + if current_chunk_content: + metadata = text.metadata.copy() + metadata['heading_hierarchy'] = extract_headings('\n'.join(current_chunk_content)) + metadata['chunk_id'] = f"${chunk_id}" + chunk_id += 1 # Increment chunk_id for the next chunk + chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) + current_chunk_content = [] # Reset for the next chunk + + if not inside_table: + current_chunk_content.append(line) + + # Save the last chunk if it exists + if current_chunk_content: + metadata = text.metadata.copy() + metadata['heading_hierarchy'] = extract_headings('\n'.join(current_chunk_content)) + metadata['chunk_id'] = f"${chunk_id}" + chunks.append(Document(page_content='\n'.join(current_chunk_content), metadata=metadata)) + + return chunks + + +# TODO, this function is duplicated in splitter_utils.py, need to merge to one place +def extract_headings(md_content): + """Extract headings hierarchically from Markdown content. + Consider alternate syntax that "any number of == characters for heading level 1 or -- characters for heading level 2." + See https://www.markdownguide.org/basic-syntax/ + Args: + md_content (str): Markdown content. + Returns: + NestedDict: A nested dictionary containing the headings. Sample output: + { + 'Title 1': { + 'Subtitle 1.1': {}, + 'Subtitle 1.2': {} + }, + 'Title 2': { + 'Subtitle 2.1': {} + } + } + """ + headings = NestedDict() + current_heads = [headings] + lines = md_content.strip().split('\n') + + for i, line in enumerate(lines): + match = re.match(r'(#+) (.+)', line) + if not match and i > 0: # If the line is not a heading, check if the previous line is a heading using alternate syntax + if re.match(r'=+', lines[i - 1]): + level = 1 + title = lines[i - 2] + elif re.match(r'-+', lines[i - 1]): + level = 2 + title = lines[i - 2] + else: + continue + elif match: + level = len(match.group(1)) + title = match.group(2) + else: + continue + + current_heads = current_heads[:level] + current_heads[-1][title] + current_heads.append(current_heads[-1][title]) + + return headings + + +class NougatPDFLoader(BasePDFLoader): + """A PDF loader class for converting PDF files to MMD. + + This class leverages the `nougat` library to perform the conversion from PDF to HTML. + It inherits from `BasePDFLoader` and extends its functionality to utilize the `nougat` library. + TODO, the load_and_split method need to be implemented and default is RecursiveCharacterTextSplitter + Attributes: + file_path (str): The path to the PDF file to be loaded. + headers (Optional[Dict]): Optional headers to be used when loading the PDF. + + Raises: + ImportError: If the `nougat` library is not installed. + RuntimeError: If the `nougat` command fails to execute successfully. + """ + + def __init__(self, file_path: str, *, headers: Optional[Dict] = None): + """Initialize with a file path.""" + try: + import nougat + except ImportError: + raise ImportError( + "Please install nougat to use NougatPDFLoader. " + "You can install it with `pip install nougat`." + ) + + super().__init__(file_path, headers=headers) + + def nougat(self, file_path: Path) -> str: + """Executes the `nougat` command to convert the specified PDF file to Markdown format. + + Args: + file_path (Path): The path to the PDF file to be converted. + + Returns: + str: The Markdown content resulting from the `nougat` conversion. + """ + # nougat ./paperSnapshot.pdf --full-precision --markdown -m 0.1.0-base -o tmp --recompute + cli_command = ["nougat", str(file_path), "full-precision", "--markdown", "-m", "0.1.0-base", "-o", "tmp", "--recompute"] + + try: + result = subprocess.run( + cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + ) + result.check_returncode() + return result.stdout + + except subprocess.CalledProcessError as e: + print( + f"Nougat command failed with return code {e.returncode}: {e.stderr}" + ) + raise RuntimeError("Nougat command failed.") from e + + def load(self) -> List[Document]: + """Loads and processes the specified PDF file, converting it to a list of Document objects. + + Returns: + List[Document]: A list containing a single Document object with the processed content. + """ + return list(self.lazy_load()) + + def lazy_load(self) -> Iterator[Document]: + """Lazy load and process the specified PDF file, yielding Document objects. + + This method reads the PDF file, processes it using the `nougat` command, + reads the resulting Markdown content, and yields a Document object with the content. + """ + # try: + file_path = self.file_path + # Call the method to run the Nougat OCR command + self.nougat(file_path) + + # Rest of your code for reading and processing the output + file_path = Path(file_path) + output_path = Path("tmp") / f"{file_path.stem}.mmd" + with output_path.open("r") as f: + content = f.read() + # consider math expressions are enclosed in \( and \) in Markdown + content = ( + content.replace(r"\(", "$") + .replace(r"\)", "$") + .replace(r"\[", "$$") + .replace(r"\]", "$$") + ) + print("content: %s", content) + # extract headings hierarchically + headings = extract_headings(content) + + # assemble metadata from template + metadata = metadata_template + metadata["content_type"] = "paragraph" + metadata["heading_hierarchy"] = headings + metadata["chunk_id"] = "$$" + metadata["file_path"] = str(file_path) + # TODO, use PyMuPDF to detect image and figure list, but no link to the image for the extracted text + # metadata["figure_list"] = [] + + yield Document(page_content=content, metadata=metadata) + + # except Exception as e: + # print(f"An error occurred while processing the PDF: {str(e)}") + + +def fontsize_mapping(heading_fonts_arr): + heading_fonts_set = list(set(heading_fonts_arr)) + heading_fonts_set.sort(reverse=True) + idxs = range(len(heading_fonts_set)) + font_idx_mapping = dict(zip(heading_fonts_set,idxs)) + return font_idx_mapping + +def link_header(semantic_snippets): + """ + Processes a list of semantic snippets to organize and structure the header information based on font size, + and then outputs the structured data as a JSON string. + + Parameters: + semantic_snippets (list): A list of objects where each object has a 'metadata' attribute containing 'heading_font' and 'heading' fields. + + Returns: + str: A JSON string representing the structured header and content information of each snippet. + """ + heading_fonts_arr = [ item.metadata['heading_font'] for item in semantic_snippets ] + heading_arr = [ item.metadata['heading'] for item in semantic_snippets ] + fontsize_dict = fontsize_mapping(heading_fonts_arr) + + snippet_arr = [] + for idx, snippet in enumerate(semantic_snippets): + font_size = heading_fonts_arr[idx] + heading_stack = [] + heading_info = {"font_size":heading_fonts_arr[idx], "heading":heading_arr[idx], "fontsize_idx" : fontsize_dict[font_size]} + heading_stack.append(heading_info) + for id in range(0,idx)[::-1]: + if font_size < heading_fonts_arr[id]: + font_size = heading_fonts_arr[id] + heading_info = {"font_size":font_size, "heading":heading_arr[id], "fontsize_idx" : fontsize_dict[font_size]} + heading_stack.append(heading_info) + + snippet_info = { + "heading" : heading_stack, + "content" : snippet.page_content + } + snippet_arr.append(snippet_info) + + json_arr = json.dumps(snippet_arr, ensure_ascii=False) + return json_arr + +def parse_pdf_to_json(file_content): + """ + Credit to https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf, parses the content of a PDF file converted to HTML format, organizing text segments semantically based on their font size. + + Parameters: + file_content (str): The HTML content of the converted PDF file. + + Returns: + list: A list of Document objects, each representing a semantically grouped section of the PDF file. Each Document object contains a metadata dictionary with details about the heading and content font sizes, and a page_content string with the text content of that section. + + Notes: + - Assumes that headings have a larger font size than their respective content. + - It first iterates through all the text segments, grouping consecutive segments with the same font size together. + - Then, iterates through these grouped segments, identifying new headings based on a change in font size, and grouping the content under these headings. + - The function is designed to work with a specific HTML structure and may not work as expected with differently structured HTML. + """ + soup = BeautifulSoup(file_content,'html.parser') + content = soup.find_all('div') + + cur_fs = None + cur_text = '' + snippets = [] # first collect all snippets that have the same font size + for c in content: + sp = c.find('span') + if not sp: + continue + st = sp.get('style') + if not st: + continue + fs = re.findall('font-size:(\d+)px',st) + if not fs: + continue + fs = int(fs[0]) + if not cur_fs: + cur_fs = fs + if fs == cur_fs: + cur_text += c.text + else: + snippets.append((cur_text,cur_fs)) + cur_fs = fs + cur_text = c.text + snippets.append((cur_text,cur_fs)) + + cur_idx = -1 + semantic_snippets = [] + # Assumption: headings have higher font size than their respective content + for s in snippets: + # if current snippet's font size > previous section's heading => it is a new heading + if not semantic_snippets or s[1] > semantic_snippets[cur_idx].metadata['heading_font']: + metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]} + #metadata.update(data.metadata) + semantic_snippets.append(Document(page_content='',metadata=metadata)) + cur_idx += 1 + continue + + # if current snippet's font size <= previous section's content => content belongs to the same section (one can also create + # a tree like structure for sub sections if needed but that may require some more thinking and may be data specific) + if not semantic_snippets[cur_idx].metadata['content_font'] or s[1] <= semantic_snippets[cur_idx].metadata['content_font']: + semantic_snippets[cur_idx].page_content += s[0] + semantic_snippets[cur_idx].metadata['content_font'] = max(s[1], semantic_snippets[cur_idx].metadata['content_font']) + continue + + # if current snippet's font size > previous section's content but less tha previous section's heading than also make a new + # section (e.g. title of a pdf will have the highest font size but we don't want it to subsume all sections) + metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]} + #metadata.update(data.metadata) + semantic_snippets.append(Document(page_content='',metadata=metadata)) + cur_idx += 1 + + res = link_header(semantic_snippets) + return res + + +def process_pdf(s3, pdf: bytes, **kwargs): + """ + Process a given PDF file and extracts structured information from it. + + This function reads a PDF file, converts it to HTML using PDFMiner, then extracts + and structures the information into a list of dictionaries containing headings and content. + + Parameters: + pdf (bytes): The PDF file to process. + **kwargs: Arbitrary keyword arguments. The function expects 'bucket' and 'key' among the kwargs + to specify the S3 bucket and key where the PDF file is located. + + Returns: + list: A list of dictionaries, each containing 'heading' and 'content' keys. + The 'heading' key maps to a list of dictionaries with keys 'font_size', 'heading', + and 'fontsize_idx'. The 'content' key maps to a string containing the content under + that heading. + [ + { + "heading": [ + { + "font_size": 10, + "heading": "5\n1\n0\n2\ny\na\nM\n8\n1\n", + "fontsize_idx": 2 + } + ], + "content": "xxxx\n" + }, + ... + } + Usage: process_pdf(pdf_bytes, bucket='my-bucket', key='documents/doc.pdf') + + Note: + - The extracted headings and content are dependent on the structure and formatting of the PDF. + - The S3 bucket and key are used to download the file to a local path for processing. + """ + print("Processing PDF file...") + bucket = kwargs['bucket'] + key = kwargs['key'] + # extract file name also in consideration of file name with blank space + local_path = str(os.path.basename(key)) + # download to local for futher processing + print(local_path) + s3.download_file(Bucket=bucket, Key=key, Filename=local_path) + # TODO, will be deprecated and replaced by nougat class in loader_utils + # loader = PDFMinerPDFasHTMLLoader(local_path) + # entire PDF is loaded as a single Document + # file_content = loader.load()[0].page_content + # res = parse_pdf_to_json(file_content) + + loader = NougatPDFLoader(local_path) + data = loader.load() + print("raw data: %s", data) + markdown_splitter = MarkdownHeaderTextSplitter() + md_header_splits = markdown_splitter.split_text(data[0]) + for i, doc in enumerate(md_header_splits): + print("PDF file processed successfully, with content of chunk %s: %s", i, doc) + return md_header_splits + +def post_process_pdf(s3, pdf: str): + """ + Transforms a given string of a specific format into a desired formatted string. + + The function extracts the 'page_content' value from the input string and + constructs a new string in a JSON-like format with specific hardcoded values + and the extracted 'page_content' value. + + Parameters: + ----------- + original_string : str + The input string to be transformed. Sample: + str: A string formatted in the desired JSON-like structure. Sample: + [ + { + "heading": [ + { + "font_size": 10, + "heading": "5\n1\n0\n2\ny\na\nM\n8\n1\n", + "fontsize_idx": 2 + } + ], + "content": "this is the content\n" + } + ... + ] + Returns: + -------- + str: A string to conform to AOS embedding wrapper. Sample: + List[Document] + [Document(page_content='this is the content', metadata={'source': '/tmp/tmpghff3i39/xx/dth.txt', 'timestamp': 1697513348.1026106, 'embeddings_model': 'embedding-endpoint'})] + """ + print("Post-processing PDF file %s", pdf) + # Parse the input string to a Python data structure + input_data = json.loads(pdf) + # Create an empty list to hold the Document objects + documents: List[Document] = [] + + # Iterate through the parsed data, creating Document objects for each item + for item in input_data: + page_content = item['content'] + # Assuming some default metadata; adjust as necessary + metadata = {'source': 'unknown', 'fontsize': item['heading'][0]['font_size'], 'heading': item['heading'][0]['heading'], 'fontsize_idx': item['heading'][0]['fontsize_idx']} + doc = Document(page_content=page_content, metadata=metadata) + documents.append(doc) + + print("Post-processing PDF with result %s", documents) + return documents diff --git a/src/scripts/dep/llm_bot_dep/loaders/text.py b/src/scripts/dep/llm_bot_dep/loaders/text.py new file mode 100644 index 00000000..04c00f76 --- /dev/null +++ b/src/scripts/dep/llm_bot_dep/loaders/text.py @@ -0,0 +1,12 @@ +import re + +def pre_process_text(text: str): + # Remove special characters, punctuation, line breaks and multiple spaces with a single space, + str_doc = re.sub(r'[^\w\s]', '', text) + str_doc = re.sub(r'\s+', ' ', str_doc) + str_doc = re.sub(r'\n', ' ', str_doc) + return str_doc.strip() + + +def process_text(text: str): + text = pre_process_text(text) \ No newline at end of file diff --git a/src/scripts/dep/setup.py b/src/scripts/dep/setup.py index 21a928aa..3f1c756d 100644 --- a/src/scripts/dep/setup.py +++ b/src/scripts/dep/setup.py @@ -3,7 +3,7 @@ setup( name='llm_bot_dep', version='0.1.0', - packages=['llm_bot_dep'], + packages=find_packages(exclude=[]), install_requires=[ 'langchain', 'opensearch-py', diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index 0fc44524..202863bb 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -19,8 +19,8 @@ from awsglue.utils import getResolvedOptions from llm_bot_dep import sm_utils -from llm_bot_dep.loader_utils import NougatPDFLoader from llm_bot_dep.splitter_utils import MarkdownHeaderTextSplitter +from llm_bot_dep.loaders.auto import cb_process_object from requests_aws4auth import AWS4Auth @@ -41,321 +41,6 @@ credentials = boto3.Session().get_credentials() awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, 'es', session_token=credentials.token) -def fontsize_mapping(heading_fonts_arr): - heading_fonts_set = list(set(heading_fonts_arr)) - heading_fonts_set.sort(reverse=True) - idxs = range(len(heading_fonts_set)) - font_idx_mapping = dict(zip(heading_fonts_set,idxs)) - return font_idx_mapping - -def link_header(semantic_snippets): - """ - Processes a list of semantic snippets to organize and structure the header information based on font size, - and then outputs the structured data as a JSON string. - - Parameters: - semantic_snippets (list): A list of objects where each object has a 'metadata' attribute containing 'heading_font' and 'heading' fields. - - Returns: - str: A JSON string representing the structured header and content information of each snippet. - """ - heading_fonts_arr = [ item.metadata['heading_font'] for item in semantic_snippets ] - heading_arr = [ item.metadata['heading'] for item in semantic_snippets ] - fontsize_dict = fontsize_mapping(heading_fonts_arr) - - snippet_arr = [] - for idx, snippet in enumerate(semantic_snippets): - font_size = heading_fonts_arr[idx] - heading_stack = [] - heading_info = {"font_size":heading_fonts_arr[idx], "heading":heading_arr[idx], "fontsize_idx" : fontsize_dict[font_size]} - heading_stack.append(heading_info) - for id in range(0,idx)[::-1]: - if font_size < heading_fonts_arr[id]: - font_size = heading_fonts_arr[id] - heading_info = {"font_size":font_size, "heading":heading_arr[id], "fontsize_idx" : fontsize_dict[font_size]} - heading_stack.append(heading_info) - - snippet_info = { - "heading" : heading_stack, - "content" : snippet.page_content - } - snippet_arr.append(snippet_info) - - json_arr = json.dumps(snippet_arr, ensure_ascii=False) - return json_arr - -def parse_pdf_to_json(file_content): - """ - Credit to https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf, parses the content of a PDF file converted to HTML format, organizing text segments semantically based on their font size. - - Parameters: - file_content (str): The HTML content of the converted PDF file. - - Returns: - list: A list of Document objects, each representing a semantically grouped section of the PDF file. Each Document object contains a metadata dictionary with details about the heading and content font sizes, and a page_content string with the text content of that section. - - Notes: - - Assumes that headings have a larger font size than their respective content. - - It first iterates through all the text segments, grouping consecutive segments with the same font size together. - - Then, iterates through these grouped segments, identifying new headings based on a change in font size, and grouping the content under these headings. - - The function is designed to work with a specific HTML structure and may not work as expected with differently structured HTML. - """ - soup = BeautifulSoup(file_content,'html.parser') - content = soup.find_all('div') - - cur_fs = None - cur_text = '' - snippets = [] # first collect all snippets that have the same font size - for c in content: - sp = c.find('span') - if not sp: - continue - st = sp.get('style') - if not st: - continue - fs = re.findall('font-size:(\d+)px',st) - if not fs: - continue - fs = int(fs[0]) - if not cur_fs: - cur_fs = fs - if fs == cur_fs: - cur_text += c.text - else: - snippets.append((cur_text,cur_fs)) - cur_fs = fs - cur_text = c.text - snippets.append((cur_text,cur_fs)) - - cur_idx = -1 - semantic_snippets = [] - # Assumption: headings have higher font size than their respective content - for s in snippets: - # if current snippet's font size > previous section's heading => it is a new heading - if not semantic_snippets or s[1] > semantic_snippets[cur_idx].metadata['heading_font']: - metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]} - #metadata.update(data.metadata) - semantic_snippets.append(Document(page_content='',metadata=metadata)) - cur_idx += 1 - continue - - # if current snippet's font size <= previous section's content => content belongs to the same section (one can also create - # a tree like structure for sub sections if needed but that may require some more thinking and may be data specific) - if not semantic_snippets[cur_idx].metadata['content_font'] or s[1] <= semantic_snippets[cur_idx].metadata['content_font']: - semantic_snippets[cur_idx].page_content += s[0] - semantic_snippets[cur_idx].metadata['content_font'] = max(s[1], semantic_snippets[cur_idx].metadata['content_font']) - continue - - # if current snippet's font size > previous section's content but less tha previous section's heading than also make a new - # section (e.g. title of a pdf will have the highest font size but we don't want it to subsume all sections) - metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]} - #metadata.update(data.metadata) - semantic_snippets.append(Document(page_content='',metadata=metadata)) - cur_idx += 1 - - res = link_header(semantic_snippets) - return res - -def pre_process_text(text: str): - # Remove special characters, punctuation, line breaks and multiple spaces with a single space, - str_doc = re.sub(r'[^\w\s]', '', text) - str_doc = re.sub(r'\s+', ' ', str_doc) - str_doc = re.sub(r'\n', ' ', str_doc) - return str_doc.strip() - -def post_process_pdf(pdf: str): - """ - Transforms a given string of a specific format into a desired formatted string. - - The function extracts the 'page_content' value from the input string and - constructs a new string in a JSON-like format with specific hardcoded values - and the extracted 'page_content' value. - - Parameters: - ----------- - original_string : str - The input string to be transformed. Sample: - str: A string formatted in the desired JSON-like structure. Sample: - [ - { - "heading": [ - { - "font_size": 10, - "heading": "5\n1\n0\n2\ny\na\nM\n8\n1\n", - "fontsize_idx": 2 - } - ], - "content": "this is the content\n" - } - ... - ] - Returns: - -------- - str: A string to conform to AOS embedding wrapper. Sample: - List[Document] - [Document(page_content='this is the content', metadata={'source': '/tmp/tmpghff3i39/xx/dth.txt', 'timestamp': 1697513348.1026106, 'embeddings_model': 'embedding-endpoint'})] - """ - logger.info("Post-processing PDF file %s", pdf) - # Parse the input string to a Python data structure - input_data = json.loads(pdf) - # Create an empty list to hold the Document objects - documents: List[Document] = [] - - # Iterate through the parsed data, creating Document objects for each item - for item in input_data: - page_content = item['content'] - # Assuming some default metadata; adjust as necessary - metadata = {'source': 'unknown', 'fontsize': item['heading'][0]['font_size'], 'heading': item['heading'][0]['heading'], 'fontsize_idx': item['heading'][0]['fontsize_idx']} - doc = Document(page_content=page_content, metadata=metadata) - documents.append(doc) - - logger.info("Post-processing PDF with result %s", documents) - return documents - - -def process_text(text: str): - logger.info("Processing text file...") - text = pre_process_text(text) - - -def process_csv(csv_content: str, **kwargs): - now = datetime.now() - timestamp_str = now.strftime("%Y%m%d%H%M%S") - random_uuid = str(uuid.uuid4())[:8] - bucket_name = kwargs['bucket'] - key = kwargs['key'] - row_count = kwargs['csv_row_count'] - local_path = f'/tmp/csv-{timestamp_str}-{random_uuid}.csv' - - s3.download_file(bucket_name, key, local_path) - logger.info(f"CSV file downloaded to {local_path}") - loader = loader_utils.CustomCSVLoader(file_path=local_path, row_count=row_count) - data = loader.load() - - return data - - -def process_html(htmlstr: str): - logger.info("Processing HTML file...") - # filter out DOCTYPE - htmlstr = ' '.join(htmlstr.split()) - re_doctype = re.compile(r'', re.S) - s = re_doctype.sub('', htmlstr) - - # filter out CDATA - re_cdata = re.compile('//]*//\]\]>', re.I) - s = re_cdata.sub('', s) - - # filter out Script - re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) - s = re_script.sub('', s) - - # filter out style - re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) - s = re_style.sub('', s) - - # transfor br to \n - re_br = re.compile('') - s = re_br.sub('', s) - - # filter out HTML tags - re_h = re.compile('<\?[\w+[^>]*>') - s = re_h.sub('', s) - - # filter out HTML comments - re_comment = re.compile('') - s = re_comment.sub('', s) - - # remove extra blank lines - blank_line = re.compile('\n+') - s = blank_line.sub('', s) - - # remove hyperlinks - http_link = re.compile(r'(http://.+html)') - s = http_link.sub('', s) - - return s - -def process_pdf(pdf: bytes, **kwargs): - """ - Process a given PDF file and extracts structured information from it. - - This function reads a PDF file, converts it to HTML using PDFMiner, then extracts - and structures the information into a list of dictionaries containing headings and content. - - Parameters: - pdf (bytes): The PDF file to process. - **kwargs: Arbitrary keyword arguments. The function expects 'bucket' and 'key' among the kwargs - to specify the S3 bucket and key where the PDF file is located. - - Returns: - list: A list of dictionaries, each containing 'heading' and 'content' keys. - The 'heading' key maps to a list of dictionaries with keys 'font_size', 'heading', - and 'fontsize_idx'. The 'content' key maps to a string containing the content under - that heading. - [ - { - "heading": [ - { - "font_size": 10, - "heading": "5\n1\n0\n2\ny\na\nM\n8\n1\n", - "fontsize_idx": 2 - } - ], - "content": "xxxx\n" - }, - ... - } - Usage: process_pdf(pdf_bytes, bucket='my-bucket', key='documents/doc.pdf') - - Note: - - The extracted headings and content are dependent on the structure and formatting of the PDF. - - The S3 bucket and key are used to download the file to a local path for processing. - """ - logger.info("Processing PDF file...") - bucket = kwargs['bucket'] - key = kwargs['key'] - # extract file name also in consideration of file name with blank space - local_path = str(os.path.basename(key)) - # download to local for futher processing - s3.download_file(Bucket=bucket, Key=key, Filename=local_path) - # TODO, will be deprecated and replaced by nougat class in loader_utils - # loader = PDFMinerPDFasHTMLLoader(local_path) - # entire PDF is loaded as a single Document - # file_content = loader.load()[0].page_content - # res = parse_pdf_to_json(file_content) - - loader = NougatPDFLoader(local_path) - data = loader.load() - logger.info("raw data: %s", data) - markdown_splitter = MarkdownHeaderTextSplitter() - md_header_splits = markdown_splitter.split_text(data[0]) - for i, doc in enumerate(md_header_splits): - logger.info("PDF file processed successfully, with content of chunk %s: %s", i, doc) - return md_header_splits - -def process_image(image: bytes): - logger.info("Processing image file...") - # TODO: Implement image processing with ASK API - -def cb_process_object(file_type: str, file_content, **kwargs): - res = None - if file_type == 'text': - process_text(file_content, **kwargs) - elif file_type == 'csv': - res = process_csv(file_content, **kwargs) - # CSV page document has been splited into chunk, no more spliting is needed - aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index', gen_chunk=False) - elif file_type == 'html': - process_html(file_content, **kwargs) - elif file_type == 'pdf': - # res = post_process_pdf(process_pdf(file_content, **kwargs)) - res = process_pdf(file_content, **kwargs) - aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index') - elif file_type == 'image': - process_image(file_content, **kwargs) - return res - def iterate_s3_files(bucket: str, prefix: str) -> Generator: paginator = s3.get_paginator('list_objects_v2') @@ -458,9 +143,15 @@ def main(): logger.info("Running in offline mode with consideration for large file size...") for file_type, file_content, kwargs in iterate_s3_files(s3_bucket, s3_prefix): try: - res = cb_process_object(file_type, file_content, **kwargs) + res = cb_process_object(s3, file_type, file_content, **kwargs) if res: logger.info("Result: %s", res) + if file_type == 'csv': + # CSV page document has been splited into chunk, no more spliting is needed + aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index', gen_chunk=False) + elif file_type == 'pdf': + aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index') + except Exception as e: logger.error("Error processing object %s: %s", kwargs['bucket'] + '/' + kwargs['key'], e) else: From 46eff33744a61ac40b19407488f452ef56e7ec1a Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sat, 4 Nov 2023 18:38:08 +0000 Subject: [PATCH 16/21] chore: update new dep package --- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 16515 -> 20854 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 47ba1d0c008bf739472cc4173c1db4035e7ae328..2648f6bd68ac352d513bda6f2a858276c766ff57 100644 GIT binary patch delta 12180 zcmZv?1yCMavo-wS?(S~E-5r7x{K4JbHQ_-51b5fq?(VL^9fC`6cLMz6-1ppb?*D#M zwP)AVtkpBKcTIO!_X_j}yUPVbQj&v!!~_5UumB@YlUO7U2s2KT?-G!IraCD|H6-*P zaT6ekProk^1^~EzM-l+yS%io{6}UDItBrR)zEE4F)sVwkL^L+oKZqJl^?V#=Fcr?i z$dDjqbL}SLieuG&atwnuct6v|ZJ!oO!YxsIWeG47r{N^Vq)9jSoigBzQ86d`mK^2B zk(O0G(DBUbQW{v#?BrvQVf+fuQx`0r7pn6NQE;>{ z*jON z;K^ABN|kDBh_U)zKwTD_3eg1eFl`n)3GhIZM)e-zao=TN@Zur@+T|v26RFtd0@^LK zot5I+>7;Vl@xy;Vg<`8uU~0k2Ocd*~JuoG6292}~YgQ_?>9G$_fXmz40t}MIgn>B3LnlGwdJ6LH!yyH^m zk@wWIp+oh-6L4%CK4hS|kN0xI#WRD>Ytc{wt`hinWK1L5O%-7i@{DvabREDEuf zXEZhDhQ!h<7lUOP1N3>R^IBkUG)1`SG7-?oF=My8)ULY+QpDdIA&pZLNpagp0)$ql z7C8tT-J`r(PjwP4HCb4!fd$)H5V#eS_^dEZb-W!nExsqLPe77xXL#DN1c0al+IsT|JZN-*`Z> zV``@Pq6MT#A|T_~kA4#ei^_h6{Y5@`4sC#^_$g|-`Po!*Z`hS_VoPW{Yg(yh2tOxbH>EIbvgA;Eq zLF39#*d-MxQ5Mj>4cY5o1))(94J8WG z;pPuZ_j=5+c~MsiUFd8zJSp@?(rycI!Q1ym&T~TBSb@BsmW3V9vhmmH3Z}44R_V}3 z4qBTWKAB5@#^>r4$@Go6b73aehwBUY4w}s}OL}yz<#hN`W_W58`1;!88cKB+x%Se7 zOQ@-Y#N5Ru;<-5o+xJai&4!^dJQy5V2`WWX^Q+?x+qoHWm{s?+)Sk{4|9O~zA-KmO z>~8OY1&Xn$cD^bmMP|}Hl8vn0-7Am96A6i0@LE`cyRZHn&;-1BlRgQp-fD{i$DlJx zvG7y}gqmE@wmy#bZvp`Z&y~ROsWNF!MVF|T6Q%Ls&bqHS_~bTZ26|HXE7E=Z$C;*>yjs~v%K<5TF^7C}3K0S#;tFJxu%}NLuB1;6|0g&DyFZt} z#X|xBuLuA@9Y1~Z{{-nf<^dS=c1{!Edk_bJ|2e%MB!7Z5CcO)@fq|91m8*dPlcN^| z;2(2~5+=ZFH!|quckKK->93%U336dJa&vWf$7U+aAP2G_w*SIP@&NNNY9wfME6g%0H@8xD2- z1vYQpi-_HQy0Z0XAjH6yPl0&t_H2?Z?vJ%EJ;i-*@E`I+j!q~I{#gU+AX6aNRB(9$ zGyvd*0|4OvwPx>YVB+Hbz61@LJ`Qs{sBa+QAS^Xb1gjj(DXRiOxM}jK9x`QQ*CX|w zaXvkRcpeM>FU70pzU+#&H0ib^j>)n*!sdh5SCb~!x#%kKt^46J)RLmuYV6|(A3fPk zp2y2OOFg};f8BYNQe64dKfI{@PfpWe9``4u83w(UXr(r`>uPo~pQv=b-6+hI8auOD zNvXOCS<$*^XBEngj(L))K;`0c9R(z=Fs|K6;r{T9LufTr86a)2(^=pSF%VAi=l4ox zfPqSMftqEQmlZipfXT}1}cjrgVEq2nO#-!se;*Kfdh-!?zuv@a00a){6 zc*@#OPE^*ZY!{aLID0?HdeKmc0z*I{<+uG%vzXOEjl0(SQ@t}Gs-Gg;QbtwhV?kk? z?3gz9dwOHD+TrfPK1mp4&N-juT;|E#wAlK-e6?~YEEJ-g@OkL?$f@TrZqO`TBWWmn z=WZWuG#)$rb*pi;0}sZ`n_S7X0{8nUA~)NUbEu1~{GHavW5R04r76xYqeia!;^-Wg zqMX`TO7!zuOSJ>O-?KDH1{Y1iF2zCUUP0hS_5MN$8e}<*#NbJ|;xDOWDX+zXLW2kL zs|4G>zU)O7PBu|NSj|`>Ymn9Rv(-hx#TT3-oKPd4AuY)VR@OY@y;di8O>#eqvz@0&U#HVHq>S--$8uTk<=Y;}$x=Z3gh)uW zIfy6YPl-JVC%>&r!zp}Nh@T^a62UaRTm=sOzj9cCW4yG(tzwa1J=8cQ&U+uVOap^& zQ>FO8)WuC9db*o5SE{F~I8GgIrSjtd=tkHVcrE+fJ+6Hm-yp#s^l1Elq#|XkNx-l= zaUhfK4{OecZ(q_fP7cQ;I7)(KzboWMJF|-P2T$((@T+uyX_MC~7;!i`5|Jef;gT5@ z-3O*E{!ofYPl5`GJgvxGT@*icXpFqVgG7}h>dFk{cdaJ_1*EK}Z>6UH#6M#sC0V)t z*1m^+#3P@CfQ;H@s}dWa5~e;k*AG!yFurKWIoGI38RY^YQ#U56?7{|$R5nD;mfR&2 zgqUMpELYlcwG@#eava9lypVPX4`uM6`9UUfe@`gV=xUw-2X`hH`1M^E zv%r2M>j$X;P11yL)D`i7n)sj1<{Z%^34Ov~majrRu8eXf!BwH4i;*5o@g{7+JG7qu z9^Iq0V?Ulpf>B1S`!n)R=~{|F33jPOfJN}jjvzTz+pyQjzUZc0TML;pC(*+1Mt6cp z46Dl~ovQ{!A$z;gS7m3^u{*riv;>Jun*h5}g;I=S~Q3kk)o3 zQVgB=v~YDz$GHP&&F3CIqD^CI@MCO~RK>5Z=Z}sj{9sRYtEASr=i@7z=G$iDXEoUb zy3dHH-Qn2FwK+00KTUDFjyt8m-a@_LO^@lk3}1U99VszD)|pN$BBaIn(y1zN4?Jo< z`_>7~7zo)qo6s)qMwLvFLR539n6gH|FbcEawALmztP;%)z0=kltsoHI0Ud@4s5(&0`CDw4CjMZ5lqW*jO^3N zP;&qcGch^)W3H~3l`D%dz9ERMyR57YI>$W9Jz6lUipE?U>DUmpZ+jZlH5CqG#yE2o z%N72u#+}PblxR9GZLA~xXsg|`C$~@w%JOhzk%h7AhuSKToyja9UxM*loNSYBI4@pfPVFt+lUtKCjwVXE1DppO{j`Zt z>M5lGr(U^VDDyJ^5b#V*PRl*2* z?4r6-$1H1tVo~qNEznXL<5kArw+shb^rVicA1x1Wd`?bY4`@kjLn7*H1`=;_Wbt9ZvP@NYbPt;)wh#3`RLv0}WXG5jlHK)eRLOd?guK$?awyhu(t>h}4zL&0XM zPE34SdKxenpaD6XlA2tpl%*B-eG~VjViR{0Zb3ChQ}+kdzfwaHkO?q6V;znb8vqEG z`I8m_0JgSv2F4EYJVJ=?>5-$U`JePiz{X|uW8B6gePOXJT7;-{t%pln`;@x=YRRo0 zVHv|l2z{tzV1JCn*F|H1T!h@)ORLX1fR?VaU%f^qr>L!{sAxkz4QRGjtrDT~ShD{# zTb0Z3akcfwvi9%9l90Dur`7ybt~Q2MS@AG;z4^Uh1hTEN=4Ft@p>|b^U)7bQOYw|x zQTAI#7`3Ux4AO5#VNh{eJ;OGG={dgskb+joN!(l8-av3PWf%jKd-|6QY-JUDND;^1 zQ~T|f9P8Xi&)wkOoui{XTMaC~J01_eXkXyJMh5!<8EDTd&6E2h` zW(Fa3*=XdB&On3Q3xMs>$Laa)jIRSuBQ$^(HPk%iiFBUObkF&V_R;O&I;dwp;|q&+ zx^QSOfb!L4Mr2uYl!u${>Ckt`B?uwZE9>k4$m8)>fRe%a{Fq&A&U*6Yq=&ms%kwirHplqt^43AU+kqUHd^>$rGh+F0 z256A&k8~4|_apwfJ>f2xzxYuV@huJ=EU|CKV!=RCU^|_FlAqaYl>$-G?YR(J#uz{W zFib~C0s$~>`c%ML`?STjEJ8{*C|8>t?&$J1?R;jNp0SN2#P;*4aB6FHyR?AxujjeE zM0+YCH)PwI1G2k&PSU|oZeukBzABO#>eN=h2_XSH$9v{j73ewsTBEdySp`J zvS?N#?!VW7FPpvs%_{o2w3Q!Ft#Z!IH}!j=^xF;=*%-kc7H6lX^bSyFx9o1uGupud zxK)-%hSxaJFYamOfI4e+=Op1nA?qC5sl=f9)@FN1{1MWWjEriTVd@h|Vfi8#(_e`I zlmKi%BVd7VOA;iftJOBm1C~;lW z(9Jlej>7xmZBDhmb28#_ZBBq2ydRiFJv}QdvKP{H?>aAh78|7tHsT4a60IPfJ_U)2 zbw;IgM&XNHT;w1NDct);PrQt@-aWgWOsamfpQ52t?!Q0mOGo+6VY=G5WO&uR>GHm- zbztZ5hMPGM=U9|Xc#x+dsIHLuqP8j5-b0rV{iYFw&#-;fSr=FR7qro~0>oX3B7^d* zUjQP!W5^NC03s#pU%1mpi&S(*Mmx>#b*S-0fU^CxmPaqX zWSrS=gvm|nzP#lVqnux{0+n*e!3|a)ScL^UIyxeq2R19Af^%!vGb>GE?akw|Rkg26 z*LnSu6QK}Dvj7M}?IY2y+7n^~!;{POJm9|vSy;Ohs)VMjhm#UWSezp_(?Fu={df{f z>c_gNRavhJLIp^G%uoB7I5$_ddfZoEBv3z)@IhZ^7z)Hpz)PMzQtW2YI}4S@zQHAg zzH+8<#M~Sp1hQaO+8E+_9&c_CPZ-X9sxBt9vfQ>tP77|~qIssyU|@(&A196x3sWEo z^hVDRNUtUJ=E{_YR9)=4p2h>N+Pd@XWuKIZKb`#v?5%P5f1#UjhK;%9Ql5sy1 z?Ph@xcv*L=)Reeu4ZE>wb|Gv=@tDEAiB6AdD*~>gwiZ7(vrHBtKY^HVf&=xRiq+?z zeK~0qpGFEl^Ya_%qajEE>Os7XvL($|nA8CrSfRCR1Cb@Okoxn3X?H+PbmQ95Ja{-y zE5kAa&$S%8PNcrP} z?+P;-JHaDrErayE45$ZfDtQ;G+29Gc;8dWi@S3y&x$h`K`p^z>UbUYosv3j7>T?>V zFWBwuUl%UrcZ||-z&sGql7G_P4TIpzq+~n3*Il>)*As6SI|jiePhnyqeUq4i-+XbO zL=Y?JKvq(=cw|)!8r$Umu-Z{^T$^PRzre3ywoW>1OA71w_`K9BIPw^Ko6a_=9JPuo zhG`nJ;g35#+CWfVX0HQFN4~=tJSO)YYdgaqRs8NtF}K|S>0W>kbO7PRj_alPR41Lt-gkDNW1Xg47?mVNlW+MJ zd5vG{Eq-^Ijd3{{vH0=Lt_uNUz@IIJrCz;hZZH{tYB&~tT>m)pu@~X?d@d_WVpW)c z;hQQW3i1fQU(?mxK(%vsWg=YeeeQO;YkB~78?X=3#8L;eO+SiDoob+lp`F7j^eH~Jc8f)D+Z z=AJU$^qKmH;hAFjOyMR+Z!S}0R&e&dj}6|ao`a4w#ihbtGOT`;d<-_zUU#Ij;F!-^+pCG)H%g2oWP< zP_<;@r@rsahsGmNxi!jB^;Zcg*^V~OuCa~yG zWQ`b5j@~5^);if99`6&3gd|!Tcok{35HL~ zB_Y;*pqS95LP*0Yf_1#)zJDF~IkAxjKXefItk|c7ERg4m^7#m#xJ1$C^oCG-qZ5i` zJ}=r;ZGDl5Y@5UpUW;5tYOsdu{TN8=jDcYDse+|=wZpK0>$rs^`yCQ zvCi@0n3#VHdKtNUDT*Kbku`(@EVbAO^0SlYMyW3A77~6qM@W7=hr->{$Wbt2EhB7J zb-QDp2|rxP*uLyd%2tn5NXl3y5wRv{w6MybVrL+DtaEVmwycfflL0MwrJrs0avtd;=4vfP>AAf>8JO0@hiH02X}BAsnQ(R8 zV{N}T!Zt*N(}6U9$=NN*SU*Ub;u=y?9w;qz7KJmV!{g?qO8^3UMA%HobB9$)MbHNvp*T^ zI`~@<%dU-2rMESE&OpwMglY<=B z@tnHvS<|1lTiKS9@B4lp;+Kvs&MmXcBA8L0&fu-YAI^Uk;x}*)6Vj0q5k`i$T&~g- zrk*u(7!bc=Cy5R}S*5%KA;qyD9b4y`uLQD{gVPg(bq(c3FrvWbq*N2z`d)Gn=NByG2AQvsJa&V7K+pts4hVu)M`#Vwr%|(XH$+^EazsN^vW$UNl4?f|zttG`*UI!Qoo~QMWbX5T& z38hPy_!i`u8kN6VKoL`4fB2g0mo~bGP~R5JUYKHWr+H9)H;|oGN*ReIR|!}e7jZO% zAWvytclz}cRhEPrwp^^1>D z=1KB?jI2x34isdal0xIUfWVd7>>r}UE)!EOt-hR~6?_lP*&jTTAMfoxEbNO>V|(H0 zZ6XHV)J!H67+P%XkpOhtlyu7%wQ-zsFN2*!bmRz$Kq0jv78L_8;dK%N?<4x*2y?y3 z=fzcVM*ZE=+p}q#&XaBc186a}?(Kz5MELZ%alIQ$2&7G@Co*)8Xm(i8_VU^&#CfU+ z(ZQEc%>QytW-gFMJVHat4G;4jfgvlCM8;ZvxMYxyE-1@GenV*OG~50mf(age1&fNC zqRGYX=~jYg$za6`*{pP{!5un0SyQY44P{927PF>FdH2`RXSiXX^U8#bBq^7)E&sLw zV0`fpWzeaZIdzEDaz$U5Jam#BEp0 zBdx$=D9hJRm?>Cs3U@qte<#i2y&C_4F;-zVR?@HUy|5N^P-B{&)^^*3!SheyUA@mXDG_WB9a)sosyo8;!M4a5FiQhWeN`V=5t~d0>MD6XOBZq$h|C z8#E=4$XuWU#J3)}f=Np(N=p!4$nY$bDDV+OE)rB?b-fnz12$-=#|)ISpfZbLK4(;* zAPv53fIuz?-ydR)DKe&!+m(&3xj1*ex>;{Q2ijvrc!OE(Ae*KF)bF;peTCJFT?!7= z+;4B9!Ns+{ahNK6nVCvXEgXaBXSTOk08vWQ;n>elJ($xl^Tmnf))cqFt>n}C2Io%UlnFX)ejdJtibnU+n15WIcX@-5HvIy z8iyj$M_nkyQL~68#J3dMtYexc4d;W%i>dCun(BFn!35Kh-ao3^$>tEPmb8yN%u=ki zoiMxfs0Cta4fOckbiDn$m8Xp<%K#_jn7or|-2ewv6tY86*73CZ+3@!~N`8{mk)gb;j!n?#MWoUKZR&jt7`uCsNMz zBQ^eRC3tyCu<*{IcN$uCAon8tu~t+HIxvjA@oC$9;pOj z4hT$wV{8~x80rm}3evG~a)O|S0)_fFTe6!+d@l)9k(jF=9QJbpK|I;+>)}flw5L&jU&lR#k4EL%iBva+vcW0Bi*UD+^kvyV6QR{E$Culz4d*-k3tH2a|KsQsFFa9sv z{u@s8`Chr(xGiD6eA5FMyj_EZJerFXb11akA`cb$Oe#h3`3~uK{vdb9QKAc^Ju1i1 zj4N(45SsO3Tq%x~TWD!<8Rob(%T4Z8odulGyc5|J_VPEmN1CvBg+s+OdZM3(1t=!W zX8oSq9!C6^LGW2qDhj$mY1Pb53(ouqY1ryErCr_!uoci+ih2x6=d)sNeijLWAE0G| zgO9)H((w2vNXz#2JAhuN`cQ#V#xpVnVz<(9qE|?^OxO-hqz#sCFLHXcC6k7gB2H zUm$K^I8R%MWlo_RIpEeZ)DrAEwqmRm71P^$-`t;1?FVuD*A_L$2#c0NpIi8oatSh)!u1?66--(aQ&ReOn0;1-n0v zm-_xDQy{{nKLt%QO-+qFrpi1|otDXbDiH7Y`em6BTa3ZkGP>p)d6VMoO_dN-{oP9KiaFt6v;FHWkF{s- z+1N|hw-a5GsGLG;_L>hnzh|o(=|LYdG%5Hf_@aB?`t% zi^YT;PH#jzs?uQ&D&58nP&gpPbKfFl`w&yEh7CLrxPIjx^dv3obPC zLTj+7&r9P-6p}etQ?k*nHP9KMDBIs-)!I4c^Zv%5jB_&&=)mLB2rUqoU$!W?=w&) zmuyV;{Zs!8GC;htFmZeZFDoSL6#G>CHZK>%A1nc%%3oM{T#@LDcj<;B6aYZ}7Z#)< zDgH@CqGEO=u2dQqYV+j_cDby%<_&l%8Uq-VnK(YC$S!J)&?*-v+-{onOoK)88wt0Bd zN4U1$NS#Xf36EnW<&Y6O%_Dx&uB-xGb&FWwro%|QXneqo=2D@fnqJVqQ4lB5+H%Sg z=fL2g{|sXUq}GE5<#cHJ!>mDG9vx)bfmODMF0PD`@tP}mu7tuB92b0=%#er?Fep9Ml!ReQ&9uC(RPkp}Upn(; z2(C+3pa4R1^A!9JS*dmvudLF8N@nOqFfQ+aL;ofRT1m%KhZ}v~Pbo|HX6&rd5WTHoGotjYxwzEz>qUiv8q=VxsV} zKsQbRW$NO185i9q2E1sFznt3qA?H(yYNYJ8%i_3p4Z^YHZUa{9eV-x`sHA|pe-YvK z@z#rw(1?3@B3`4}j!_DjyfA4f1S{)(54;j;?x+{fl7GSzg$Ez#;$YRiz*v!C<5PZ~ zYHJHQs^+w^o<6lYkfbpu^pS2FV(1uUL$VFd;XO+I>BoJA)V??G$F1%mc^>7RfO$y! zMbApG^k_npM47y)%xS|4L=!RfrA6L?LNx{nW5-;+mfH+bG`6?O-Lkm!<@V7nIB(4R z3hxlzRoC9mGi^t>yf}>m9zKh*(5b#d9jHRuzrk)YH-wa7cpZ%M+s(wqP1#5n%j1yi zm+4Pe6dP3dJ^y>3R0KmO?TRji6E{_i-Sf_(QIae%RM0Dwk{lQ~0w%=&Dw*Ri1;qYn znB%zw#s8W=3rdmxx0Lz)Kl{&7?;nfyu9>2Pc$Y!P>k1Kq;l&Hm7v{g4lmDjvzq*Nklk5Gt zSpM_2|I<_aH}U@!0sbK-ef-bc{wEOpoB79)q8D;0`M$$f-UsF1wD-B*|C`jiTV*5t zZ*Th#s@`8(`j78r1xiaw%K!gOBPaK{A-{tW-v`}4q<7Ohf3f0GMezPI|9>6yKgfE2 a9+|&fyM+icG8zC4@bKQUx0L?_`2PSHEo#F6 delta 7952 zcmZvBWl)?^y7a){Zo%E%oxwG@6Wrb18Qgua;1GfbNpP3o5FCQLyOUsFvUT^{-Fx?~ zI`ve&r=PC#r~60ueKAy62o$QS0uTlV001BW+>}*QP`QE9zD7k-Fu*U$M&D#%G*Q8P zy1F`^+gL;eumFGzIsicS2khYBXl&-}VQgXP!e;5?;bv;?Ve9NeE@38kB329?S?{M%9v7?GjOtMeX5u2M;RxBKF}+*@e)xtDjq zKk)pV;&Y`85uJSJMwK5KeRsoI4VXg^Xdl*hD9=4ZBHxI;c8|*X66tTJ`UU)IWa&gx zlQwXnlAzI+VYvybpnLHxOw%*DQ_%^Ok{u=UdG|610vkQ6omWPmW4@f7R-hvd*GjjCz=(n5s)p9tpXwIxf)LsVVv6{W{qP4Ox-Aa)5IQO zDc%unbl5N6n`&ba@i)LK0p)SdVWQv(De4_M9Hw(&V{+UDeEAYi$eQm_Xh-^}GO%NJ zu$5gVC!vK|xj(_Of=ApYk9o{CpxNa}twFsBXJeRrY4=udz2oId~+Z<_Y@Xe=pzQAXpx^CU89Tn2`)*KNio*Fy7O?ghH`{!P}NMC7rHf4 ztYBL%rJKwAf#1%vLLMA}9~p6YuRJ(wKL5%-x!}QkPh-fr@8WdtWfVLz>?&9L!{8u) z+3(ce1M^x@b1TAx=+q?Mu*`=`NxGU{rOFi*yBp^9(Z)!iW$=LFD=NO-7eCI z2|P(Ul(|BZF)p`5Rk2;Nwa<#Pe82Cv!=5fhe0-ZgCioLRU9_cNlB_MUga&d>798mW z`7-(No{8fHg23m=GtC7`$Z?4LiPR%#$$?6!nZmOIQVI;qhY^9qy{Xu=uNxdH>yPhd zgOjy;h{IopBWAhP!t;@=xaCikA?vVGMX_Yb@gbtlHfNZ=Mv1S?G_;irA+lv8w5F#k zgfdu#{Sgeobou83-Y;uALA=jWW*JF~XxWMIi3b5ZonQ+3Qpp&)WKRja{sEc8HPK`N zAp)%~9?eZOL0Qe=6^}t3>~5Qe#+Q^9>f(757jDkPJGX6}Tr5_XaJ_hVNyG5NvAH#6 zxjh~)-C+73f!8Go-Xcg6NU@~8C(=5BHNOr$)Z;RTTz3_7<*MyCL0K_*h{yK#b5M!W zef4{EhTw|;KkpSuK{^%Al{^xv0~q9|=FO9r68)W=oSH;UOgIyVPhIi+QF6R&w32i^ zb8DJEs(w&^95lOvv%VNAjOd4%cURZ?6_AU67n35zrz%?{4&*->*v}ekn>AyK4OiE5 z87Fvv6zjW7bM#Vi*3(c9-gA-{{*YH{)M|Xh;sE0to9W$75afD!g=)6%vrnaQh7E@s zQ<_~A^gYUXG2*eCNV06Pu;#&-eqO(OTr2}S5i7KZcTG@kKI`K+nI4 z9h&z@DG5MYrQh!nuWy{tvjH<^nKNqJs%uTyDHmV685ta0@E6N)D!&VtDU?+_VSGOT z9CTz@iuxu;eRV=6a)e6VgH;avJ`1*lCe#=l;}Kh1 ztyCtjV*SMpdltHX6JwW{=@}%Tv>$z0RsaH`j7xtRuv+sOSzMGBz`J)*v)0wDdH3^d zK|5DQqE57>{E_%>si41LGq~^|ptp3E856o9_jhPn92Bsjl8nqjD5?~(dZiBwKR8v# z^CO>&K6&f6ho}qqYCSM*D9bTOX3X?bq*h#x{_Ba zl=$@+g&CJ0}T?}??eRI#y1aL14*m$ZQ z*a}x!faNeo`FX*;rkN(w7PMC&;1oX!WrsU=%JCxM%<3iuTqnt_?HA zSOzS_|E94ES;2E8iY60V&&wlLu=ayd)9%`}R7V6OUM&^9aOn~|==Tss8&+Bg#rHhg z3a&n<)^=nP3q-9e3a|ts;alRvwp%nZ?_160y2dA}C)N`-Y>wC2%6RG(^#<*d6Qvkj zE&2($71QP(YQfgnYU`{n6Dw{O@uhR!!aWjMNo8}7y@RFB^R*ddV-Uk0U%a{z5LBrX z$5sUsVZ;>Tiap(>9)!y~3$n+h?hVmn@`lLbV01Bz6@#WozffuKt)&`hVgP^M23Gef za(0;p7>IGh1{g@;`$LC3BiCW(8IYLs)|XQhBTQJC!fMqSYtt5UDuPUUzC%6v7UHed ziC@Du<45~4UAyWTtYi+; zexUxCXh2h}77S zVe%3OuStHT((E-1b6+p#H!#JoS<5!W$b|r44#0sn}muK?jBcTyYv1 zg}T)Y|ETY_+vNEBN9wTJ66xwi0sAd<#X^TGbW3|OGI`t(;bbdonj-OzEFG;4pH{V6 zP@(++PzdM!Fo_JxjsG|%flus&yqxEIAjIqBbWkM|?*wVuPx}{b;!%7RZ}R ztg+4uF6l9b%9JqBP@{1Oow<1l+(6iLr2dn3p%=ldVWKNxT%Qi~8TAcKTaCJGJ3n>% z!wLx+GbYrVrPa*AH~58Rp435LQ{GPSFx@r#)za{}(8pC*@`Ow>jVo$yRJ5PGk_yz% zc58*&n>OXrwogxsWOt(m-bps|PK!PxUvOWO4D1fu$`x!A@wGA;JHJtvfm<1{UlZbb zg~#%LL_#D<bwzF1;u7)qNqX2$P_191p#J*WdN`Z$vEE30xXy>)_g0Oe2F5n?2f6ZU)_ngeu9R~H-r zg>^-c&G6<=sLy~I2P!e7=gmoMfSgu&eU5CKt@;B4fYr(!C;XW&C)@mxd8aH{JTF@` zjDEP5(z&=`r{LM`_;D1D-Pbyi3N1Iru`7%x$qWS~Cgh3Tb$Ar;j8ODKU9M|I*n7kn z-lXod5A2`A7Z<{L)0OTBk3a=2W>n6$2A{h_89ZPe99Ao3_!ij2=_q|A-%M70jBo2N zjrQV=bTKtZpl{^l(+nP9140pi1Y=?P>~LD)K8A6X!{b-6&gfT>G zT%nkZoo+_8c_5APN}4(kTv_hOczJtk=~O6 zckpkRtFEI@ZH#jx=3ExqI3E-Pi9Yv1^XRz+ds$SDI;HH1Bl_}T^JSdC8L?8iV#Mm_ zv^xy3_sJB2$LLa!QV^r!G6_n@QOZLT=dji`)Eg#l7X0>QT|XMzM`YE;iZ_FW`+l#G0VDt1?|CBs|jRA#${g8|=OKZDO64 z=DTY-bDH(&D96B5sJXi8J4@?1^^>ZHFA9!e-h%8DBC(S6la6|HXwOHto|oyb$o5NvzD zyT^8qM^P6WB@v2~y_R^{d^+}2Ak7{4#;bm1L^_-NeX|FSQifDL%CXahPtG}E?U#7r zTEw$K7z;DO_+Or<3ruu|xlNKHJUwlJlv?*>zM*?Nl8InG5`v@4&uoATbumD|`YIEw zpWj7J6Yj=5!yFcbH@MV8O|-9$Xnn)h9~~?HJ=5)&Oq#dC#MXu+&CiE%!0R=b6>mQM z!OBh$uNXn4R``MVgauTg{9ArFVQhM3KcH56+idtNigqN>f{ zU8tl=s4t{LJ6A15QuT_-8l3l@>saBt5+Jj z@h8Y{GrPcRnK{?+MeC$b&&7k_?>8{tAy}R(w{<#P;?hPAv70G|7>*=+TDs%*qbK#Q zFiFR&VYV7=ZBIICMrJ#tpi1|GKzfS7u~ZQBs7+;{kLHEDmjs=Tw4OnA{h0tLme@%(*Su?ix^0YS zx71RrNaL+(`gMa|ZF%7j+Ade*;|e}=u||Myif*Oy&1tSzoHVnDvp3kJb5SUyt2)#; zte;KD=&w(JI>)cCGoY5+OQ*_6)A2leYNTMP@CNH1LUvp&6lkV?^!K!&L z?7GbS>oDK1)EV@zX&kImS{fc@TJn?PopM82jw)Zs_SZcN7N|I>SiG_W_QFpFS4CctkkLN& zW)RY$lT=}yqFlI1XDT^HLjz$;LlwYkwJ?^MkRPULEZYFjW-n7VbX(|RHABAxIqs#(XQg9wV6T@`woxfPr;6Ef zp)rpP);_E*G>Z>`^Mp!-s5*igu<7{+6cI^B+fm^nq>Tv-v8yR75lx!OCWQo^g?#AX zLHXVTW+~qh>r^oPP+8G0VkWKt@}NZ@$4;#m`}lA*9mT^EG=vI>+lTf%7X?&T-pdjaLS`EXR9)&Fxz6m}fiK z*S8I7`(~V6nL_*Do%HxGb-zvf69hEKgRks~HcRAY8z1C6kD2?UiJWiI_N3in@!FN$p;-A6w0&U8gjOpCAI7)!w;ikq`D%gQrS&%osJyG} zs~*v+xpN(0Y{PFSH*jAS2tFb&-n14juOd0}RT5lJe`Solg^!i~#a%<(dn++{nzJ08 z6N1J(c&t%}*WS)Dc%1)YC~r35hms!x3jfrNy`mCW*3)e~8p1>1H?vPQR7z?QvUwvW z&A(U+DU2}j-`VTLCC8$!-94FHmde2o*v&&YZl>l3_p=EyBW5>;980Y8=Y)352RBQs zd>fz36!%D zbqY@d-E7RKVa2{#P&4^CSsv`Jpk|in`f24n;#wjiiR_OcYKIX*p-!3z61O@Jkk~gT zip(g$P@!ASwqw^WgtNj<(aqb?@6qX#vMtOJ$UjrQq)4@V?rw&GIJ>D-?u?jN0~qY! zw8NpE;`<$Z=2L1eM5!6HUdwABtt*^Nm1f3K5r@;{t6N#`b0$OQok{Lu1Jf82k#@Ly zeY|8(FAj&XIJue6$~h>WZ$0sNqmiIr7`~5#0Wg7&m}}50ffpm91r{p%D(k`mzHY}y z6kjb)QRWP3=*3f%T*Z(5-gQMj9<-9d9Kf^Dp1+Yqw_Zn^p@ zSwp<=K~*~nMws7B%@AF>sPw&QaW`p<^#PY-N1t0h7#A9xKBH~~`A3GlUJmMBo?Q-t zi(hcj6}_23W}@G1LPZ#gvy-LC0)e2qQckADrqa;!`#c5>1_C~F_zCfQADm}=P_h<0 znhI%*^#%O&H|L!rC7x5;60?dYCqWvJ(93McO50u_lt%ZopIeXDs}2GYKRwk z%)zv#kvZ<qT zr@4oxo8_MZ$1$KO5f`BxVvy-^Yme)vl~o2iTsX9Q6PL+Ji2ju z<7aW;k5lvHm>_zR^W9s&01bsiM&k1vK~+&UE?blxu|awYQd$QNaaYm+MVOS{#71Sm zP4{eAr;OPm@XKzENzw1#56e8>!X(QKTXj3veu;;zax~gBBumz}70E zY&5U`lW@ZSEx4FS%pvKrod|!XPSIzMbBLatE=w(;X(mvfx%O1WTl4!#d$#+*xW?7Z9o9 z*h1G=)yCodddyZs^3%%wr#S6r1fkjtlVqY-TC-0?z|M4becQjTou;EVVy}JhB|xej z^rB&P4BNC3Y&KPC?gR4+WT5ES_*8+9R;sGqhly$t!Ti=tCc?*GG7i z-^b+O@vpNqAI5c5Kn|h%Sx+@~Fz7=nE* z)*j8OSUkb>zLbCOM6Ki_fsnbC?h zx@v4=a?O}vZ(3icC!!NnC||VIu5!?`1$`16jAf8o6@@vJolF_TJ&?L8o%YPMD<@Z! zZY)Mb{_(#-6_flg0a|9iC)%F{ls|{zKM2qS{{WdF n7-}g00RBtB008B`1EI|S0!blyYG`OEe|X}60KhirFXI0Jk}9e# From 169b0d4c503abc2ebfb55f62435fe0af1d29afef Mon Sep 17 00:00:00 2001 From: yike5460 Date: Sun, 5 Nov 2023 12:29:25 +0000 Subject: [PATCH 17/21] feat: remove blocker to allow pdf be normally processed in glue job --- README.md | 89 ++++++++++++------ .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 16515 -> 16515 bytes src/scripts/glue-job-script.py | 19 +++- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 85164015..cf38aa8c 100644 --- a/README.md +++ b/README.md @@ -44,80 +44,85 @@ Now the object created event will trigger the Step function to execute Glue job Use Postman/cURL to test the API connection, the API endpoint is the output of CloudFormation Stack with prefix 'embedding' or 'llm', the sample URL will be like "https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding", the API request body is as follows: -**embedding uploaded file into AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding, will be deprecate in the future** +**Offline process to pre-process file specificed in S3 bucket and prefix, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/etl** ```bash BODY { - "document_prefix": "", - "aos_index": "chatbot-index" + "s3Bucket": "", + "s3Prefix": "", + "offline": "true" } ``` You should see output like this: ```bash -{ - "created": xx.xx, - "model": "embedding-endpoint" -} +"Step Function triggered, Step Function ARN: arn:aws:states:us-east-1:xxxx:execution:xx-xxx:xx-xx-xx-xx-xx, Input Payload: {\"s3Bucket\": \"\", \"s3Prefix\": \"\", \"offline\": \"true\"}" ``` -**offline process to pre-process file specificed in S3 bucket and prefix, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/etl** +**Embedding uploaded file into AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding, will be deprecate in the future** ```bash BODY { - "s3Bucket": "", - "s3Prefix": "", - "offline": "true" + "document_prefix": "", + "aos_index": "chatbot-index" } ``` You should see output like this: ```bash -"Step Function triggered, Step Function ARN: arn:aws:states:us-east-1:xxxx:execution:xx-xxx:xx-xx-xx-xx-xx, Input Payload: {\"s3Bucket\": \"\", \"s3Prefix\": \"\", \"offline\": \"true\"}" +{ + "created": xx.xx, + "model": "embedding-endpoint" +} ``` -**query embeddings in AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding**, other operation including index, delete, query are also provided for debugging purpose. +**Then you can query embeddings in AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding**, other operation including index, delete, query are also provided for debugging purpose. ```bash BODY { "aos_index": "chatbot-index", - "query": { - "operation": "match_all", - "match_all": {} - } + "operation": "match_all", + "body": "" } ``` + You should see output like this: ```bash { - "took": 17, + "took": 4, "timed_out": false, "_shards": { - "total": 5, - "successful": 5, + "total": 4, + "successful": 4, "skipped": 0, "failed": 0 }, "hits": { "total": { - "value": 890, + "value": 256, "relation": "eq" }, "max_score": 1.0, "hits": [ { "_index": "chatbot-index", - "_id": "038592b1-8bd0-4415-9e18-93d632afa52f", + "_id": "035e8439-c683-4278-97f3-151f8cd4cdb6", "_score": 1.0, "_source": { "vector_field": [ - 0.005092620849609375, - xx + -0.03106689453125, + -0.00798797607421875, + ... ], - "text": "cess posterior mean. However, we can expand\nEq. (8) further by reparameterizing Eq. (4) as xt(x0, (cid:15)) = √¯αtx0 + √1\n(0, I) and\napplying the forward process posterior formula (7):\n¯αt(cid:15) for (cid:15)\n∼ N\n−\n(cid:34)\n(cid:34)\nLt\n1 −\n−\nC = Ex0,(cid:15)\n= Ex0,(cid:15)\n1\n2σ2\nt\n(cid:18)\n(cid:13)\n(cid:13)\n˜µt\n(cid:13)\n(cid:13)\nxt(x0, (cid:15)),\n1\n√¯αt\n(xt(x0, (cid:15))\n√1\n−\n−\n¯αt(cid:15))\n(cid:19)\n−\n(cid:13)\n(cid:13)\nµθ(xt(x0, (cid:15)), t)\n(cid:13)\n(cid:13)\n2(cid:35)\n1\n2σ2\nt\n(cid:13)\n(cid:13)\n(cid:13)\n(cid:13)\n1\n√αt\n(cid:18)\nxt(x0, (cid:15))\nβt\n−\n√1\n¯αt\n−\n(cid:19)\n(cid:15)\n−\nµθ(xt(x0, (cid:15)), t)\n2(cid:35)\n(cid:13)\n(cid:13)\n(cid:13)\n(cid:13)\n(9)\n(10)\n3\nAlgorithm 1 Training\nAlgorithm 2 Sampling\n1: repeat\n2: x0 ∼ q(x0)\n3:\n4:\n5: Take gradient descent step on\n√\n(cid:13)\n(cid:13)(cid:15) − (cid:15)θ(\nt ∼ Uniform({1, . . . , T })\n(cid:15) ∼ N (0, I)\n¯αtx0 +\n∇θ\n6: until converged\n√\n1 − ¯αt(cid:15), t)(cid:13)\n2\n(cid:13)\n1: xT ∼ N (0, I)\n2: for t = T, . . . , 1 do\n3: z ∼ N (0, I) if t > ", + "text": "## 1 Introduction\n\nDeep generative models of all kinds have recently exhibited high quality samples in a wide variety of data modalities. Generative adversarial networks (GANs), autoregressive models, flows, and variational autoencoders (VAEs) have synthesized striking image and audio samples [14; 27; 3; 58; 38; 25; 10; 32; 44; 57; 26; 33; 45], and there have been remarkable advances in energy-based modeling and score matching that have produced images comparable to those of GANs [11; 55].", "metadata": { - "source": "unknown", - "fontsize": 11, - "heading": "3 Diffusion models and denoising autoencoders\n", - "fontsize_idx": 2 + "content_type": "paragraph", + "heading_hierarchy": { + "1 Introduction": {} + }, + "figure_list": [], + "chunk_id": "$2", + "file_path": "Denoising Diffusion Probabilistic Models.pdf", + "keywords": [], + "summary": "" } } }, @@ -127,6 +132,30 @@ You should see output like this: } ``` +**Delete intial index in AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding for debugging purpose** +```bash +{ + "aos_index": "chatbot-index", + "operation": "delete", + "body": "" +} +``` + +**Create intial index in AOS, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/embedding for debugging purpose** +```bash +{ + "aos_index": "chatbot-index", + "operation": "create", + "body": { + "settings": { + "index": { + "number_of_shards": 4 + } + } + } +} +``` + **invoke LLM with context, POST https://xxxx.execute-api.us-east-1.amazonaws.com/v1/llm** ```bash BODY @@ -168,7 +197,7 @@ You should see output like this: ] } ``` -5. Launch dashboard to check and debug the ETL & QA process +1. Launch dashboard to check and debug the ETL & QA process ```bash cd /src/panel diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 47ba1d0c008bf739472cc4173c1db4035e7ae328..d419e81ebe94cfe24393bbc29532b00bc2679f49 100644 GIT binary patch delta 74 zcmZo}WNdC^+_2Yzxg|Mu@&SvzV7kF_2AGz%3Ifra3#<;ZfCWtK{lRpby$_haY99io HbsWM0GMpRw delta 74 zcmZo}WNdC^+_2YzSz9)F@&SvzV7kF_2AGz%3Ifra3#<;ZfCWtK{lRpby$_haY99io HbsWM0f Date: Sun, 5 Nov 2023 14:38:44 +0000 Subject: [PATCH 18/21] chore: update aos api function --- README.md | 13 +++- src/lambda/embedding/utils/aos_utils.py | 71 ++++++++++++++++-- src/panel/app.py | 6 +- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 22853 -> 23209 bytes src/scripts/dep/llm_bot_dep/aos_utils.py | 49 ++++++++++-- 5 files changed, 119 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index cf38aa8c..7fba9062 100644 --- a/README.md +++ b/README.md @@ -147,9 +147,18 @@ You should see output like this: "aos_index": "chatbot-index", "operation": "create", "body": { - "settings": { + "settings": { "index": { - "number_of_shards": 4 + "number_of_shards": 2, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "vector_field": { + "type": "knn_vector", + "dimension": 1024 + } } } } diff --git a/src/lambda/embedding/utils/aos_utils.py b/src/lambda/embedding/utils/aos_utils.py index 6b80cc13..7d1cb88e 100644 --- a/src/lambda/embedding/utils/aos_utils.py +++ b/src/lambda/embedding/utils/aos_utils.py @@ -29,13 +29,37 @@ def __init__(self, _opensearch_cluster_domain: str): def create_index(self, index: str, body: str): """ Create an index in OpenSearch. - """ - # create the index - self.client.indices.create(index=index, body=body) + + Args: + index (str): The name of the index to create. + body (dict): A dictionary containing the settings and mappings for the index. + """ + body_dict = json.loads(body) + # Extract the settings and mappings from the body + settings = body_dict.get('body', {}).get('settings', {}) + mappings = body_dict.get('body', {}).get('mappings', {}) + + # Create the index with the specified settings and mappings + self.client.indices.create( + index=index, + body={ + 'settings': settings, + 'mappings': mappings + } + ) + def delete_index(self, index: str): """ Delete an index in OpenSearch. """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } # delete the index self.client.indices.delete(index=index) def delete_document(self, index: str, document_id: str): @@ -61,14 +85,10 @@ def index(self, index: str, document: List[str]): logger.info(f"response: {response}") except Exception as e: logger.error(f"Error indexing document: {e}") - def query(self, index: str, body: str): + def query(self, index: str, field: str, value: str): """ Execute a query on a specific index based on a field and value. """ - # extract the field and value from the query - query = json.loads(body) - field = query['field'] - value = query['value'] body = { "query": { "match": { @@ -82,10 +102,45 @@ def match_all(self, index: str): """ Execute a match_all query on a specific index. """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } body = { "query": { "match_all": {} } } response = self.client.search(index=index, body=body) + return response + def search_with_metadata(self, index: str, query: str, filter: str): + """ + Execute a search query using the query DSL, using bool query to filter on metadata. + """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } + body = { + "query": { + "bool": { + "must": [ + {"match": {"content": query}}, + ], + # looking for documents where the metadata field exactly matches the value of filter + "filter": [ + {"term": {"metadata": filter}} + ] + } + } + } + response = self.client.search(index=index, body=body) return response \ No newline at end of file diff --git a/src/panel/app.py b/src/panel/app.py index a0cf9e3d..9ac7062e 100644 --- a/src/panel/app.py +++ b/src/panel/app.py @@ -118,11 +118,7 @@ def pipeline_tab(): request_body = { 'aos_index': 'chatbot-index', 'operation': 'match_all', - 'body': {}, - 'query': { - 'operation': 'match_all', - 'match_all': {} - } + 'body': '' } send_button = st.button('Send') if send_button: diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 5d82071ad58e59433fd63f0746c24ec0eb9cb50d..5dfd44ee93ae9d32ac415985e5fb7773f9fef73e 100644 GIT binary patch delta 2597 zcmZ9OcQo7k8^_K;_MoNqz z&FChWt08C zcSa;ks1Z6=TzL%a6XP>)N9{guAaGbU7!~&>I(lp1Wi8)Y$(%2Xu(iAC5E>20TyEgy zIIh(7epv11uK3IZ1l&=kQS$_HJyla0xF$;E-op=cn6<9 z+AfLn=CiE+K8k-BFbii87|~c7azp0Dyh21Kd<^od>AZK``gj-%g+4-mZn;XwoE62`TQ*-~I+4OXl3kYfvF*hGbfo2YBs-`5JHR5wfH$30bg zLd4$?Qz$HQX1zTGUFd7EqF;7dx|kiWmf!}KNhMSb;kfqC@8J3gpT1f?o5aMWaae9u zF+Uj#@wm9Mt6l5lR&s9Q=@ARPsl0lnL$x$7)3UvnRdUsYnWd-UNN^3rIoe2f&dz;r z`#4&s{EN1>2ghDID)bmz=970; zB1hQ^>R;F^XmS*cc%cPcIoCwo1ZKi&lV{>|$`x{{$8~jwxnqT4VD$bR*NBxmnCZ1f z)>_9=w#O2`G=%Lu2k7Zte&ZDadpQ}3$!X6y;^rc~n zOCHsTMWz=M;tL_VO4=qDvr#UvT9jOl&ejgr+0Q2+wE(D0CJ5|mLn=Wvcu2Q7gLFyv zwY%fGI0ciX6kw^Dc^MpBYDhR9fW|2f~r@)QBde>^x_BbK1aq(H%%IZ!GFmLQ2B zV5GSE)pJGJFSO-fHp%1(bmqozCs}}PSGPNDccIES4t~=Atf>b9=^QTC?+J_1&VbFh z8aVD0imy>+h9jL)w=3{BFU5`%wb)IwimLnXxMMTd>)-xv)&(8r+<;Ty86pQs;MR5G zud8$S@2u}9c6~g0#51$1R1SP{gk;j^LQ^RW?(DXqpgPPtG5=DcWVHonL6d` zF3P^LQYGlsl-m|;rO9C28qZ> z_S>35*rR?7rDp|#uZs)g6^ruU2#u-jbc~FmI%3x%*V+lS-PgIy&cW3Fu5lBBwnr_a z#dM478LNnC7^I10_! zPO#$-q7}T{+c?x*?0pYyRzHBjhW0KuXHtU2Ql$n4HJ7SKwXeB0iEK0whX=hyUdbJA z4o5UKmP91=49OkP1vN~w419{B=dt2qvVFm=k-HrnOgCj~D{(-G8R^Va z&7Yjvu(>s~^7P6&FnRTw0WucKcp?GA#xx(Lb77`eTK-N<+CWiJEK!sX<9+@)+ujOo z8khK5Y)}1a91Q3@ASctd!BJg;-)YDg#o7Ea8U1; zti(-c@8ukghcVcEsn}A#1(x_gcq8;jLiklWIBw&CxrzUNF z_62fhQ9qUeOeVchBN;#yEe=^Sw8^!Oczucig_e(^7V9wl6`LEr>lomZy)dw`nDs#! z;Tv-=)2&4M!OI}Y!*%@QoTZzUImQhJLvXqDQZA8z3B**&G$!9VLl|ieQ+E@Xr)q@y zl5USne1wWF|JFAPhWOkl(_t{!urc{YjB}6x%d;a4HveKnI&@G_xUiV*M46!c@2ZX4 z-D#mx6{r;)3z429>l{j_2~xYU;*0ZEktVe z{a)YQoDP46wimeDDlQcDJjwa*u~p$eHFra6I1r7Nhi zRv}yIr;3@Wcn)?!&?_U=H>O8e{$1UAdbN+^VYtLGO&N#lC%CwN%zG-TJA7PrEZy|- zJ<)5~`_I~a3F}|=*&=0|84S4H;%2#C*l-no2)9xr*Lr)DUMU`@v3NaSN?~ix^vrX0 z%Z#I6kL=&C@iwzI?tWL?sP_0CI6);DubM-d}z|-8*vhbgXPzRBF)tt=lc>Z41<62Z4rEHsV_?f4IH&o zPVTs+r(mfRBR1ntS&Rl#vj!L=K zn1ryyDcC(pow`7iXOM>R(8HXbdy2BLKc*wN{gww9w%2q3g0Hy$|kra?a=dI*-ryTiM9kO<^S(3to=7?($ELunj633(GHo zY&!_H8(|gl<(zOQ-9>mwPDi#Gzu@}{uVYikii6iG510xIH)vDF%*lDBIn8$uWAQ>A z-V<~fZ!T9FXSWuNBE+fAxXqa13;!YqeV&2dNHhx~K37?Y$A7G3FE|1r@ z`c|vy=Vwi?sTt0Wr=@Y;|Y=-=rcd2kvyYG_#crd<+WaAIxSi z0BVg9M#GwDs!FPGBS**5M!y+C9A@K?aj{EB&ZiY*5O5`G0UXAOIPWHhW9_aQWHG~6R5)pxi)=9W?& z$c3dK-96hX=J&U6;ncoAH2nmxz{bYxdL0t2rJpD<8#kRF@iy>QTs=*B*(+Y&u1GYP z9ug}R*tC~jiOp->tIBNn=|mk#n7@9(|LZ82{!!c0SSxyi*|6_e)<)4!9 zXsP4X5Y|S>&z@IK!uc7IxIj1slNEI5uhr_NnbtbF-!DyWkxW;gnhypOlr@%VK`idf zn=M4Mf9?DzA@PIYO^!Gu7VHEKWuMiE85vGBd|0`gC~)X77omoVnMAthcr;D*IG&ES zC?1@f=SJCMD)%?yL1=>BWYPjmwq@zyOjII=tr2K-@>+CRS<3XLR&&& zL!$(>06JS6?MoH&&Gf;1>2m!*Yb{{+E{GB()iuhf-^hkP^VY}0aj<#=2z}kiQzpgk z&8j(Vsz<{E=il<_IK&g= zJ8i!3>XgB#(6l64dyqck;{8?s_tqj&2XRr4qG#Pw)o5enX11S zCadwpz|X~SrLBEMAw&fYh`1GMPFY{A@$>uJjK3#E}S0woYp|Pg9bK-Y$?ex6K}{j(W?Zzop_J8LKa7nle~%o zpB9<>5rSzoExG45C9yO6E5`P7gy=hNyz+7a-fK|&9RGtUV3CpP-D22hiCC_gV=^_} zl2^+&B$g4lB!R-GUQvi9CR;{-R?=QgT0K=!su)YX&QSxsF<}^W zFa$Ln$9QMeEMd}LN37@2=^wVn@)9O0^OpUt)LF8-%9{~Lg9_-{+aJfGA`&<7x!X2z zC%){U-?bDvk-}R!$3=Kr9@~YBQabbWpL!MUH|m0=xSDk%(6MncU<2$Vjc@!ZEs>x{ z3Hp>HL?*W|tf9!6IO*9y48Blg(g;>Uo5(lWK|q!^PVrt%GY(BUv%V_ zV;(I<+83ks4b&_zVMAHCWqPG_ndt%6SNtR%(h_S*xN`d~UZ2E|^O&h3p;5^z<<6z& zAKl(DA)aFFZj#d|qXgM?r%)b(N+;*OCZbZwplPK7b@zr%mt|pb{CUauMmB%Rh064* zb#&$>u?5BU?iQmW9W7puls9Tg3bjj3XKKaJS8TP3(CnUMtZdBVy-EFpJ@Ru?@ab@J zWar=YLfu{+#g!IXGVzM8az6R-?UBHaiiNhu3EV=3OBie392oYA+>=i3AW!Gb5-5v= z!^j%~1n)(LR%$8gbI9G>UEEXUAHG#7o;l-H`e4sA48mh-f9kMgg--B3NU0Qw6dy<2 zi}QUl;jwzr?ni5rDXwA4`6DCVwN?OFPFFBXUjG|)ueJ8BkgNJe@*yS&rpC*kzO0bv zV|h1Xe{w5M!Ehs3@fM`dma#<#|9#zweYbv1Y3OS<&+I4`;BuRRvU~{b#{QhYJWjifKB3qVzd#p!Fz<>HdkoTs1)9Hh>yZ z`S=7Nky0R(Enw&X0q!Hkj^){rDA@gx??{nj**lWg$D#_ew_*i}v5B#;uyC{d?{XY{ b9Zqtxu*e_jf2)C2+#U#V)8-^Q9@D=87GVM{ diff --git a/src/scripts/dep/llm_bot_dep/aos_utils.py b/src/scripts/dep/llm_bot_dep/aos_utils.py index e1394333..7d1cb88e 100644 --- a/src/scripts/dep/llm_bot_dep/aos_utils.py +++ b/src/scripts/dep/llm_bot_dep/aos_utils.py @@ -26,16 +26,40 @@ def __init__(self, _opensearch_cluster_domain: str): connection_class = RequestsHttpConnection, region=region ) - def create_index(self, index: str): + def create_index(self, index: str, body: str): """ Create an index in OpenSearch. - """ - # create the index - self.client.indices.create(index=index) + + Args: + index (str): The name of the index to create. + body (dict): A dictionary containing the settings and mappings for the index. + """ + body_dict = json.loads(body) + # Extract the settings and mappings from the body + settings = body_dict.get('body', {}).get('settings', {}) + mappings = body_dict.get('body', {}).get('mappings', {}) + + # Create the index with the specified settings and mappings + self.client.indices.create( + index=index, + body={ + 'settings': settings, + 'mappings': mappings + } + ) + def delete_index(self, index: str): """ Delete an index in OpenSearch. """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } # delete the index self.client.indices.delete(index=index) def delete_document(self, index: str, document_id: str): @@ -78,6 +102,14 @@ def match_all(self, index: str): """ Execute a match_all query on a specific index. """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } body = { "query": { "match_all": {} @@ -89,12 +121,19 @@ def search_with_metadata(self, index: str, query: str, filter: str): """ Execute a search query using the query DSL, using bool query to filter on metadata. """ + # avoid NotFoundError: NotFoundError(404, 'index_not_found_exception'... + if not self.client.indices.exists(index=index): + # hint to the caller that the index does not exist + return { + 'statusCode': 404, + 'headers': {'Content-Type': 'application/json'}, + 'body': json.dumps({'error': f'index {index} does not exist'}) + } body = { "query": { "bool": { "must": [ {"match": {"content": query}}, - {"match": {"metadata": "true"}} ], # looking for documents where the metadata field exactly matches the value of filter "filter": [ From 7027c515eec215aa67cf1539ef8329ad02f48da9 Mon Sep 17 00:00:00 2001 From: yike5460 Date: Mon, 6 Nov 2023 06:30:03 +0000 Subject: [PATCH 19/21] chore: 1. add full file path in metadata; 2. adjust sfn timeout and embedding chunk size 3. tweaks in doc and logging --- src/etl-stack.ts | 5 +- src/lambda/embedding/utils/sm_utils.py | 2 +- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 23209 -> 23172 bytes src/scripts/dep/llm_bot_dep/loaders/pdf.py | 49 +++++++----------- src/scripts/dep/llm_bot_dep/sm_utils.py | 2 +- src/scripts/glue-job-script.py | 4 -- 6 files changed, 23 insertions(+), 39 deletions(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 33b9d6ab..954a6886 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -55,7 +55,7 @@ export class EtlStack extends NestedStack { // Creata glue job to process files speicified in s3 bucket and prefix const glueJob = new glue.Job(this, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V1_0, + glueVersion: glue.GlueVersion.V3_0, pythonVersion: glue.PythonVersion.THREE_NINE, script: glue.Code.fromAsset(path.join(__dirname, 'scripts/glue-job-script.py')), // s3 location of the python script @@ -152,7 +152,8 @@ export class EtlStack extends NestedStack { const sfnStateMachine = new sfn.StateMachine(this, 'ETLState', { definitionBody: sfn.DefinitionBody.fromChainable(sfnDefinition), stateMachineType: sfn.StateMachineType.STANDARD, - timeout: Duration.minutes(60), + // Align with the glue job timeout + timeout: Duration.minutes(2880), }); // Export the Step function to be used in API Gateway diff --git a/src/lambda/embedding/utils/sm_utils.py b/src/lambda/embedding/utils/sm_utils.py index aacace3b..76675faa 100644 --- a/src/lambda/embedding/utils/sm_utils.py +++ b/src/lambda/embedding/utils/sm_utils.py @@ -16,7 +16,7 @@ # extend the SagemakerEndpointEmbeddings class from langchain to provide a custom embedding function class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings): def embed_documents( - self, texts: List[str], chunk_size: int = 5 + self, texts: List[str], chunk_size: int = 500 ) -> List[List[float]]: """Compute doc embeddings using a SageMaker Inference Endpoint. diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 5dfd44ee93ae9d32ac415985e5fb7773f9fef73e..4c55004295474f583974132cc1a71ea4dbf873dd 100644 GIT binary patch delta 8071 zcmY+JWlSARw6za#aCg_@?nPSM-QC^Y?cfxb0~8MK?pEANaf-V;MT)z8y>D`p@6Mk! z`^n7O$^6*KWL304g||S}+frc3&{}CD2tr4v0RaF23IKo)0Ox2Cf>S9lh-v%@*a=)h z3XZ2yY~h&wxllsV6D)YKDxx(S!Mzmt#Q4?vXZdH6we2`aax0>BKgzcgcyM{oaubAO z!B-S`ATW_JkF!g_M`$P} zA!RCIEv1F>v>k#W6KAat>IkWVUUt>_W!R7qIa6n;3EPP0Ul)?Dmrko2Ttn`&EWWsy z&Zu))3oqX))>Sa7_5xeC&36aW(jIq-Q*;|B4Xkt604BP--Il}Gc&j}V}5mUJe__8 zM3Y4B=bFdxPP$~`))``4TU)J#f=`}P{KjBEdh67d%dB=6l%lw154?sq>TZ9w=B=%3 zg{qZ|CXFE}(~=1ropv@Xv1&RbG;ZlTUNPpZ36@wH;+ehLw7Iefd9CmEO3vMbdr~YroI5Vr5ngfC*5?Ug(17*F>Y;{GZE&q*p009ioYlagPa~JjNAb)l zT2)puw=WmkY^YUnOhUm|l^wtOjJ_!Jp7uGe9y)o3c-oXWt!`GfAj2WhC6?97DPgmI zH7Rdw%0%@WUNhn-N*^X2>a>`o5?9G-`M2F>kyF0rgh8LkV;bis?lx3hNcKKoTQt&< zQduOgCvVww193X&SKE37YpQpj4Pfrh(FrzcBE5SZG#GHK)nt^;Gc8PTyR|#c zLscosMFOXvbz|lBq_2>)kU2|~U@ce%d{3<1JB2Qz`9AFHsy8HFFNXF?0P9~g`!YGX zlnG1VIO{zS@(cHkknl8dGb2(uTzX|`$Rwp!&U=Blk_l&7b#0E-jXxmN`rx(nFKLE; zKYH4{@e$Qk%rZN>-DEa()5#Iy*ELsTvHsYvSL7=4FhFpZ7EzNV%^dHtE} z;tTi6F%SNwW=}af4aLH5kdnlC(`Q*AX#6+%xB$a7``t<>MBCI*L#T8A!_J57Dp?{q z@`-gdN|x-WBsCy(FN${0x8|xxbqgME`o>c)oTDzh;P0gJUvReer8E8e)6?5;;klOr z)SNp}LTA)}4`te|x1y_av*yP##$5s0(eN93sqaXL&3erv`5pk5BoE6$#Bc$hr9wS( zO0`8R$aY}t+cE9D8Tfdzm%0tu6N_b9zIwLyJFHpq&A7Vr6CkqeLfflHTjKdW#$Clb#2$)nHIFH4<>PDbVjb7SbU5CM^{O*)E1##M5_p9`aIU?T(+Q>UIp z`o*Yre$QrWf9aH3Dts&OGjAV*ago)w`P(Ig+$n|;@7`_3LRW_rXZg(rr=$Sm3b{bl1BWxDd=5k8M)X9j7?(KM4NIA%#Pt<9xq<`_t} zseyP|xk(o)i}fdz2RhFs{e~E1H6G^}8)K;($2f&MG)%*6D`<*uOW@kdHmhSg5=-m_ zo_{Hq{C$vJ<6`DHU`dkd3i*ree|2;+{Ri?0LDT}$|cu2O&l zk~ryK;BY8H5okp0X)yD%mh@8vyhyaVcW3_7;JYKHo0t&0Ck_^PUOl%F;Yq)!$U>pnk9OyFXW=1@YNCZZ7 zH z6Qpwv-fq$G;&f|k0lu<~%`Rx^-v>7Ec^r#>$~{r7L9WuvPrmIM9YxG~bQn;&9P|*7 zb~8e}q_Bb7Mlh){&&AK@A4`rd(t|(JER~ZBmZ>A`g+*R5hGm0CT_Z2>{z|R@0s2hy z)!g-WY0<$k2o{3iKMHaS?*gqRoHec{*fEUxAri_U_%w2%%{k4FAJPJDu+%$--Eb{_ z5^eCG-8|Q;8|E=Wqu@O+Z95+HJuc7}TefP=LD^y+VM6WLk^y zm_p(*&GZ2E?hrBpOB{jx`6-V4g(UKPEG=e*kXn+BV!nM(A7p7ndtS5~nDHzJ7>psj z98bDxReer5_HEk1qLnk#{G$+@B^vlay`_?jyxh(8bzmly%Mw6TRP|hmAuLnYPsw(& z%FY_rhy}!!0e=P5x!e&CqO#RRUK0tLG73SsBp&<;z#jP{QbqZo7Swmi6KoGBi(!pn z`aRy$@`B_oD?rH(Adoj9$aZ{S*;H}x5=@~X)Sf4j zd2C!H3nQD@!;T=n(*c(r(uVe~zk?$628=jS(f*bx_H~rB9dKI!O$s3KMh;G{x(57kc#^O z!HH+@gTuJyd+qc$@1Wf*^t@8QWJpKMYPeZwcK&;C0B~MTbdw1}M()2!gORw3Z`nBe z>;QUr)o;ZXf0u*xEW$%6r%Ev}>2@D;Pc(w>thAB)+hzX9L&`gGQxVZ4A5oD1UC2(; zvxn_SLRNSs)!F)}iYD0wDDv{%yec(@N!gT2Mm?DxVyKf(dI@r3LI?RwAZ6!Kbib#}O&N>yWFf_qGBf$Y>TnSb zW7&ng$G*bt!ecNy%Gy7pefs{jE=5n~8yniW@$p#57W5&4l*kyln4968*YR75@e>8V z8~V8-KWQK;fh8hJ_jS*=_@#B1>Te}Dd;{3)gk4t4#XOtK$|T{RQtDn??xmGVP(FkF zG{YQMxm$96XPJTT8S`dHw!N1zZD*OaagscC`>MCxbt!K>B;BGHu~`ckREaR6^D#wU zCUcL%<@`yPB}cE-GZ3=+yfuIU`kInFOSCm~T9%s>Bh{1$^(Qa$eP2f(lkZfMEk}kO z5h#(cMw~<9Z4Mv^Rn2Q~VljO_u{)salr2uzso-d#m(8Uprxl8)P$9TbvT-RWBlY+l zjaleDc@*jJ5HB?g>dG*Cphk#BZYd958z!55l4%gFeOrxD03z~T*j>k1(88hVNEM=X|{0I<8nUe`I2xPC7O8(;1;XTKwb0pt>s~u z7S1RM%QL4)+&(Yi=k%Se^V-|$zFt#jM5k8rI;r8m=@tYkbN3FRm7_1-LxrR#@svhJ zI`*uahFv8`U6s$jId%+gW8&tZ005ErCv{+11`!+uP^> zD)gJ&qO+i+QMy_vTb9>Qh-HYcNlhQIOvXaAXhC1!G}>^W;E}rluZ!2# zMU|k35DM*v>}=3)Rxh#^l=~?*VGWRIqT7ufz>?)W%zW_%G?RP4bjlyLJ!PQd-M^R{f{dNy_O!idwn|i1I)l<4? zg#r4Zb0~3D37lmbAZ*4n8fo^C`YL27n5G(Gm(^Ah*V?>#TJsQ`?gS<^x2&2J0&F_B z??H?{1_tV0&cliMz5wRDwlGKHg_+SticyPpVh3NO4nXl7bu$*!gw*-P&W(SSTbHbp zd_^fDN+Rt^k7Q*G#k8x39ngjJvR39>!hlRG0A3f1fn^uO<>ii`67T(Ep9!ps&WJ#Rv#s3A{+8<>! zcuT}squt3qoQida@wh3$Q(4G*wdW-Vr6$u*SV4{vdk)+C@KM>BSmiMn&@BD?w z)v-SB`~G)4k^B-BZAm7rgWa6)8lk(016`S&X9k@_atNqHA%aXM<$oUa&^#_tbsiqk z0Wp zy~&Pg`>LjHtHbir9O8UI>a@Q4;5AOOZQyCRqBn2qah@=8tWD39eMUswy!XVewKd_{ zg0Uz#i1sLC<^J7JJ}l5vSRtpja@9kzh70tmElGNoHK*WfChO77%2TB}+^QbRvul!Q zRS@b6M{cbj6WMOwlTWtg)~^QV?QB(JeQcf?v{Xu58CPDHP9*_$QTD(J#|OLMzy50j zGdvBvG%O9bQ1EgKhW&}6>lP)K%p^hvI3c;hII1bkHq0Ye%tT?g^ccK|-*k7+J6J)B zk8*Gv-ByfmFGLEE32V=jbDe&h8wA7Ske3E_;TvxE0t6VZk{y4v8r*bEDURiw` z+bD+DeRwN}!?=EFeM!|tWx%2RgXAxKI%MD$Wss=?9*=+gS}03OrKgefm!wHx=cb~d zK2B>*$m*DB6bbw@%NocNtZ(cGJq*eWzaT7MfmuTsmFw4qo@VV>>;K!g>S2}-PX&gE3~-mUkadMDdA}^?iH-(6UH7>hCtkZ5ZWoz z{V*$L=-s)Y#?0o_&L}vdWV9WTAt^6!Uk@%dm?XOys7opK&Y>QLD0SSuikc~>(1LBH ziMM_kQUk>s0=*$5MQH7LwsUPi+nWdss5=des`eVPOgSyUYUH<>(CLB5J~+;UEGv*TQguNV<$2 z#KTR^R|maKr2y53OQ#|AOWGu9r=e?y1${Qlnf`)zNXk33f+ZT=ELzhi4Oi_kG-{2IBFRClHL)e`V-TAGPl8lxu#!m$o{geK(&2 z>q+N&9_jmepPfLzx^hs9nsXaN5{Yy1$}KDVBzP62@yHU^OhM{+g~OqexNTJ~bQJF$ zyE@e!tto~TUXxOvFpJT%hfSV8<7kHS@F&E4RLKI2aO%yqR4OP(>pYu7D2Q24VOQbr z;y2W9M^wM1%?yy^Z#4$I*RwVwWM?2y$F4rjw@BL#`{wDRk3RHTzTG~|){5Ska3d9G zk4-5(CJ|C1OMxJ15laSy*s*X$5nq;J?Y5&Eq0XqbsJFkz)6t&NS5K#Y`_XUAvyc6V zVb&kPi8rl)!d9ZMO|;s0;LXjX%)`)I$M32TE%y*kqV1n=W^)t)sI#l7q=?lmpfh?L zx#wRav5R}@{dj+d6TA8}B$YIS)}^|6=xFx`v1(xe4Gwh9NZ<)fStXm{gsMCnk9gax zRBipc`jZtwu11YV)QN>>VYX}43;&;_p)lbj&(3^U~hOO4Z9Eu zxeE*+Vt>c+6@r$yIBj?vhUW?0)S{`)#9AJ{eIP}By)Edx1y+>6L|e7U+ChBOdf_cD zv*+c2P9P)-bMJoIEZUC44jiPaZFMdBQ0i5s3Rq`c!ef~JzENtV0RV!A<07GYzi~4T z_fsIVtWggGWW>?yqql6F#LQ}i?2M<(0`H!jdf=YN&Cd(p(fH>y39xdhjIUcIMc3{` zRy%CM*|>n&Gr!^`fEwkMzl!@Or)vRTMTJ5e1;`^sk?3%V!X>-^doy&F;%P^1ml$xq zc&HlpyYC}-reamwo{tf=hPr!;H-lH7G@lHA)v)xB7A(;M5#sH;=#kOL-pd*2(G=?C z`HapdT7$WRz(YEL1QCv?TjMm?O1|7u02RT$L6n@I6tM zj~FLl{T6s}R2Gw<`P9`@614pdnMF-iDDrj{Jyyd=y-T_T^cJIQUa;?Z6o@wyKrFPr zD+Kn{bmr3&ss#B>I(I^&n)Yr|w((R+J~ri>QJQh)oDJ);8GQriqv)!h)G`?@pksek znojB{|FUu_GU6xD1(_UL;3DK?TVui5^-xP++>3c3CVeQHgkrYKInk+*I&5NFPXJTo z!<)xZybQ)KAX?Y1<=gU;?#GcZ2*y9|!!0e(_`q8G>89${B`r$(>80w``PStX09V$= zP5#&b|9bF_g}X6Eu}QfDPLLty*cfgpf~)0fM?%lIoHqZuz6OIcbkuPQ4pFAQ_|Ax7+)B z?ns;bo*^1+l1UsM6`1-X;n%l15I^W@_csW6>UZvkOVB+GfM9n>AdD%UO6kS{&R118 zxFus@P#)tZU(lcS>sLA7*J%@s-lu}pV zL%8--^Pt-Oc5iV1ac^b~Pl{j{g9O%ctGNkf<5~HL$;zVBUU^jPQ@as-+k$ z#&2lJr}ZfwY>TNn-YP~6rvSNf06`x79K7a@c5~OC>JPIb@KF0$V9HUyrEM1V8j)n? zH1?vPsp!gRu$H<7UH?>8x8dPddWA`6wgN9GcmJnpDjEMN zyQ&((Kd!COHMaqdQRM)o6v`36ZU+h@YtR+dAH#B1pdg{q6NgfUSV&3vKA)Bfa*vfS z1ctD^yI)(VrW&a{7y+}ka-$gM{WrC5`a0(62=CFmQ!J`Re@s3f?`_)Z3-C2z@0f@R zvJuYNcqLS+ow%w@5j<*knFMY4iYLZgNLIn5R@Zqr&n15>>-+&BC63tp`(L(zDx`Ti z=3ZwpYaRYr#}&}OqlRn#(WZQj?p~mp=G-Tq z252cjuGQv3d+&>B;-5dnXU5!RCfNl>m+6A3A!}k~?ugk~4>y|SR-sl+r0+6{874T& zUBhg4!l4h&v`yTgTe+6DC%G~L9QHB;OBupT{R5zvSfkp}w*!GYs`fQ&80I@GuY=FZ zR4eK9wcoVcZXr_9Uh4l8yw^G%}9y zElD5b6v@v3mV6L)2d;@G9YrZ>xzYz^g6;~&FFrPojOWRppJ7R0PRwk!h2rjQnGmVA zkl*h!yBU0&Mn7W-4QYep#V}wTBss}P>xqqj_T{UYMtW)$$s^&(GbcAqf;U*^Smd?6 z!k}R{4*gO=3k1l3k7R|HMjwurC;F_~3)1od@&X@J%py{}wVXH?!#r9YM<(4au9=cv zQ=lkFDM3!yH;@W}73oJiJ-V5aA|!k2DmZeRd2xbc2s~)E~Uhu7~wLD|ln83x&b|zBQrdKD$LL!oA0*Nx%jnaH)CQ zA{6POeoVGz;2F{8rH%OC&^rbcGu7hYcY`l4ygnoWn8w>oVup!DjI|)*YuTi5@TK#^ z6UF$dmG6%p9o8%@^{GI^wj0MTC$Wmfg5FWyBAZ0Y_+;Qi(Yb?#?gug$+X1C;%=Sb^ zyrf-ugLK!!4&Af{az(57KxQTln(*ip@Ty4f72rXfarf6|7mlG58r=>Uv=1_ z8o(^NjQ`CAJzZkxB4x0!Dj7IU7w11z{STR|{D+QoG5S9pxrdYg_0Ybr}+<3_-=s67$ zxRA_Dz|}42wWNq--O)hDN{Cr(F(lY2E@5-;3)}Gx(>9A>AUhhKX5lO)`B|)+gz6hf zUA13jshcE&=B~Q0%;EC+l6)Go3y2VivW2Vscl+>%xf784na$EF-=OC-hd&{%E9Okr zujS9>E9y9vJI=Es1+}Y=%cB&j4`UI9U^41DqV0-&Iv`o44)Df`Pqg*^;qfaf^D3>Q zpqyug7sP}Zn??&X87s5HHUQT<0A?uzUcs3aU|nRgj6H@*P2!=P!E@n|>~v0hi6)7r zs%&_f)x9XJH~40Hiv~zDn<7iShZJ8X5%QjrD}RnYg7_26SpzdIH64w(j0&v4X;Q&^ zgC9%oe2v3KkCB4EiSihKbo00=tzwvIUk!jMG(KtwI@jWkko*1Gt@yfF*0pfajmzM} z?2%1Ii^(Y4?VXxAk!cIK7dkhEx+WGc^A;IO?#rws^z1DNq&Hf8Lf3XnRIp3;D?&j% zH$NUZ46qmx5ByXC{TP$ttE3YMrn!439r=1y=7lE25R?kUuT;%1U`b%o6-TKU%udTb zI;M7JSxgB$e+_Rt9Y92EmR;eiMolylGfDi7OfGZ4^daC>{aPTr8FK8h^xY=;onFCb zc4VHj>1}L@I|Y5VE37}tydX%tJ5f{6#3s}F7-v_5<(8VsynNx5!ARZ&KVgynRwNsmeKJ`IkhnN@tS&j= z&Fm-NOghml#rO47^{{i@hq`y!l@>MGC+6%Dk>ApIwBQQA_Qd8|lq4#IQ`!o=s+H8B zU+I2OtvKY-o}rmP@yQdt1mjE+33f}p3kmXU(AV! zA;aL2!wx^0G=xr#EH6Ad+eSG}19#9KGUskb8F`K|rLtyY??TSXFEZ-xf=~D(M-9{Y zz)2#;q8{>8u=dn zmOhJ8XU+nYYAM7XR|{@MHDPuQnF5rIFR!3sWpD(xzDC?nR@-#4JLZA8_qvDD9OblQ zY=kkI1k)yVE}GM6D$$DtvN?TzpgvKXl%^s6P~m)kxz!OJX&z2-$AfkwUwpf~%*BV! zy;M~D2|kYPfsc2Nx|0V};Zxz!p(VGHUUB;Y<}X6Dqq1ZBRtjc`UZ_X#uX(KR)fejq0jl13Y~m@m{;r<00Ts){)e;K`) z>@`$oA^;#p37RNO18x*|^jwxtUcYB6DRaV&l~#oLdbW2=>X@}b7O=}Nw|!qU zd`c~keBE(hE?O3BXJ1y9jrKA5^*aKccC)f&(RSakwl$>o0u1sjo3I8%I#@#x5JKqP3YjU z2N=*bHYU8;$oQf|_ST4!`_gn@`+t}-ur2IoR6>oFM)I6@HG_SaN-4^Zg%=>+_}xsv1?3f>Fmy-oXYja*}YyFU+!+z z_XB+PIbkk?#2mKk-DXo2{w61}bMEl9G-NeFK8u|w6#U!=>cx*j5?Y^rW(baeooC4f-7odFQD|od=JU*eL0DjXFUr!v4zzu<19L|s} z1F+gLWxJ(}p56m4Fnoi0aDJV6I=#il(M}!d6m%?^(N@=?D6aVDakenkg@Mu=%jw%5 z?ai$K&Cq-A@j!0o*9JQg<*f_G>Y`mb4(t@$ehOg>N#%7(`9MJ0IdRQERU<7Cdqh@L z8W@hVP%!f&(8w(2fqjG88MV>!@N}xA3aj_3S3nPcUSDl8{oUuCop094xb~wyPpf** z8^Pi&nx+NJHLN)Hd8alTrhSN}?R)dQ96;CknaN3$J?xoH=j+q#4!BSu&Bf7?RRO%S zTV@qKqt*IT>X_lkHU6y(s$Xp_E(m0!G~o2?>^h|p#v=sDccq?Ie^LS1p+tZtz?|qN z2sN{iB15>syUxtIVh0uH5Odqtoc6JD<9D9LBi~icpD-mT0Gq5dbxq&g+cL49F5v8S zR;!`wXYBpzthf+xKrfeZYDQA(cVzRe*DuMFq1wK}V`-VL_$+~VGO*0q zSss>z>TPhskLS_0n@8`XpIV%rAO2Jpb-$J=`hOd?OG3 zl#rg26yZ1$#|yz!+9~}qVE7z!fP5EO=_bw=T8CV!K7Su3%ywpy4uQ;M+$AO16GN|GB}~ zU$}T=k@qJ_yha5pqS2m}NAhiFXJ?$p;6@E%L;-9qr{+VFi%s%ZEyK%-H4#W!Dk3^f zE&yGkV>H3b@P`cf$j?PKVc^pcH&0JWt;D3m$d?pqZjZQ)OliCUQn>}4L$Gm1ZSISz zL@_2H=lyOD@zn*)MCjt991bhBDAHxNxp?9aG|heSovD{4yAi$ErS;?*8{ z7&k$UqdBSH;l?`E5A)ggb!8Oxc3TcunGvmmOplD&?Cc3y6I6*Z(W=y80eIQsSumOa z!5l>dt@(j`|5V2GFSr|`6?&xH$pxc}@a)uT zs|&%*dkDYnz&nnSasdM|Odra0z`q`Y1(?(xMoWUk z#LP@_(G>s93*88$EqIOW0T;8mgbYd8L`(Y985E zAbWy(_~!GYP*Tz3s4kUTmVmINbii{f&Dht?pm_l@F@4>0vbtT~c!pWvGx0s<05*+& zZ1UU{r95Y#X=z*{80YKTHOZM&_-@2L+}My5$xl0Nv>toU)9ZIMV#yIVCD~0~z*v}F zxT(JxbFvq6~94P9hE?mqF0;UisbQ}`!S0rRXm=k49gl7*tq zF(!WGJ4)s^?+tgN;YD-kc@J-m=dOSzvK=yqsA-b~gqWY@z>{bj&ptoVWr{no)HQ7H zdDOzkH^f+%JF5?2xsK2|F(klXc)4Xsa`w+!9N1W+Vj;~J zvx9XWJvFJQ1-AuTSzcM8MD2S02p{Z>wisc7XZ^xY;O!L=%l#>p(vd<_qrsCQe|oM3 z657L|2=h!t60{B%7~O`ewo&g$YBbcn{4znUk_2l9_L=P)7dh*|Pa8ia8ysmTP2QsV z3@n#Lc3ayie=k9!Lf#2;SAWD)xQ6evC0++mgSP3d2pUKB{W#f`#J8S`+CvgNRRF_) zD~{oAFo#Zn?$)}LQ7HMv_V2yB)np*Ni&Gjy#c}~yk76xp9bxNtc>V6y>Y!1;kHfZC z3j80S+7V&O4f=u*X9km_l6QWS;cwO}Ll~$FYgFC#x*28tX}gn1d(bSI#>x%|l1JCR zG=H!Mia$o#P`b9L5ko*Xgq0ncn~x&FX~`|8U{ilrHmf}2aY_vQ*deX*A1kK8SFBA3 z;3@~~gSa7;L}Hyc8n@lxZdoA&rBRF_j6jieMu(j$hT6FUL|Q4dw^wDvz_DRh!`wqOf!m^Gwbu3D;E{%Lw_!38=cqu{fej6ZL0aHZ+xr{py%H zDq>swo5}N0dUuMi)rvpXcuP9TJ_n`+u0uDQKx^3(?@qW7{|0*#C5lpj2a7g0?zgOa zyBr)rHy*c)(!8mN1H#E`j6*0$@wC2jbj78wT5O-`aw4x79EAqa!4ZX)fRR>fG#s3D+y8HZjR!_#=y;{Hiw(qX!WfFbI(4`Qh06^U@k?G?(lvMg-CAAQ`?ZF7}c-PR_b!9uX+#X_2Uf@K(8#i`b!C|1 z`~p+Qz?qfXS90Iq{7ACBJ#xh%^q_W&-$?O%vVE(gthYI{!tM<26l+DlHtOE{`0jF| zQ?_poW|XWK;<@N;$u>2RMGmHs@9wMb6RG(oEtZS@0ds;NE!=I5)V=4HC+l&mjdvmK zwtr=xtYUnAc9BmR-J1S*nsf=e|0_sB%*-cR!bm|%5(~|4u~uJ_amLzpQ1*h4Iw9t0 znf?YdneX7xp}+>b6vkTxWTS%X9xjr_Lzqch@~%R<>rx$#c`SyJpIr~i87g17P zDNFB-o7*Qh4TOg$eWz(JX<4aU+bNPF1xSdJR70|po@#5K9&gZoquT!Zy!o^hR^?;T zFSz8yH!x3VqF?q_${%}EM(3eHmO;0Wnhot`iDNaZT;MTS9Y(tni|YKrDaN2Aw%GBg zLZ8Ut7Bocs2L)lBHrV1FQKL`XG7!(+IcFXhdk3wz+6CV_Qg#`|F1y2f#654_<5^EA zAG4V0EL-X56k;eQC9kOA@DRZo7(>!lUUuQXhp*~)+~CGi8!DAjv2adi`zA-X=1*(v z-%n9V)gi|%Uvud8rl*egAiS_r}M+LobZC#l0TMp z#b%=qQcRk@OffF@u-Y}jw`Prt=l6my%umk~a5blX9P9Lpt0d>})3U zhj{{>0Zftj%-42RIMWm<92~Q-_6&N%13* z3&X{iKn@*IFt{Vq^^%RWI3~tJ=IqIo^p?SOZUX!}oDaR>yhM}lFu011^vJ*tgBQq| zsim1Il5^R9C33}qGFYW=)p@)wWkU2Cb#)R4tGIEvWC$L!tFbYMp4Y)(PxSn|Bul#N zDXb0+i%UU)?N8ngn@C|Ugs}l|_S;xi833J|t*u~5oifkjJ#C-ss|28|!DoIe)sNGY zX&EKs@B>^fwsZ7~EW|FqChx#DOkASLG6gUsdVy52wlLB*s9K_7Ng~zUVMBg7{dLw` zq@PNpay!T;yjw(0Fc%>qV`6R#!k{sgR4m1Y93bi%B5x&fXlqa}Jz5_CMH}>soT1k& zCxt)fIc0W^Fjv_T8Lc+jnqdGnIYL$R`&KXqx5t;^RP*7VQOVeH{fyue3I1>Mgc70) zQ7uMRnTn`^7$O8ZKFb+?H~5^$omm58T3x)Z3EE$Fu|~Md47O5c7VZte+-+gLPtAv0 zPi?~OoVjMik%w=(IM>XG!KIPgbd{YC%R%N(zp#tGC>)qeJz>27D2*Oqc^dTAbnedy zfH0xk@7P2oDZi5p+tR^1kr~te52y#01#HSdM-^ecqI#(+&hHt}o27`w$+smpu6|9G z(Et9c6@O*c{AdOW%0GEYZNlb7kPbd#YakMZo60e>toS^t5gG>mg38f2t~%ToIvH+c z>+TNEhzO2?wAk_4#O9ZWY0Ayk4LcxlqMt?fYk}sKi?l7JP@Rm}ONB$-Zd}h*D7HXF zNmYrB8(%dwc7|kc*7NA3c&sFQRf@XzEGJhD&^YFb;04mULA|otZ+$*nVh5zF{y;Ac zClmf6sin~}i*W(oT5z2WBnNDU>D3gsGN{CSfE@^ByznQ4-c~@-K|IO*A~+3KY(JP- z$^43z#59r11ScswPk3uYuc`d?G!E~+n9p`VFu_-oInR*iEy%h%Takak7D;P#u;|nv zzDV~_iLm}W-7Ge9kJMdt5vOJ;w8%B<2IAv3MtO$$n*r=LmVF^)4aRktPp%-g_l~M4 zt0b6k;JzxjsJBIx_~lMJNwoM`<(?@Ts=BY1$ws+iUX1;L%X+|X%hy8eJRB`|Qd8A9 zJhP6=ea=G+J(Ebsv7$R*52YH(K+S|*{d7jgJH$5SEh~~zM8x4oVbf^-#pF}4 zW+S2aK^nf6C*eG5cySV53@#p(uXv(fc0sat=8A7gWH!DAzq%|c>_^~V&GgP561Gl( zs7tYUQG6u>t_k2@@0eJaT7K6`DBm3PGi={yGNph=*5=Y-znU0;_r+VEv#cW+?Ik9UQH#&U+1)?L9Z92|UtH6{yQOX2umR1FkT##+>BG#2Bzt^K zMf_){m5L%PCYtM@`jiUJQrQov4|{`2-}rCGH=HuVXW8Q10nsIvTCcVe^Yh#vVigx(|riWn_i73aCu^ z35XXTzMH2h6-J)efh$4EIgjJAQb~~~#x{Rag%Tl_SxlkD_M4cWoT(a7pCWu@;HbQg z!{4hoPH=1-;E5U;U5xUSKSOEdQc8X!2W-iOA^`ilD*3uVsGRDLnDOHMwFY3hpP-+o zS;Bkyk_bDYUwkeNoNSXf$Bws>mbvNdAVCRch0Nj2u8faI;Plj8mx@%SdeC<&wzy&f z8mC#hh3G})EUkEI{`XjdX5<@tLY{VA{xb|KsI-3!M^=Hzwhkqd2%uwal^a-?J_C>) zmkxBYgTt@i`BT`-8*(1W;+l2G+j!m^>&a9uyzhkxgU9oD`*oku>LZ7@PQNI&qw5=C zEH~GK1*|(kFr!~bFELD~|1F`X)nW+)wH(NEJ3}7Ub1ynHzl17Aw=jr{b)oh~LhWYF$1wbK!yz0%8{$K{9w0Jm1QU zw^!t}2#fxQSFf;9<+(lKUY{A(={VK5L9~j=_t_nG1xMF%IDV`x2p~eUuvLOeZGzlo zA5FggS{sXEiTs&UOzWcEXq=V=t29%ygfRFcztQ^|8#OJZ-e38ReCanG6t|H~i`Z&R zWFx5~U|E?&@%JCQL|+1?=HEo&iE1>R-u&oVs1@QDaNPa4hrnjV?)iCsn7&IH5A$6_ zp2$I}aR@^z!gvG8S%F>cUUM-8$Ur>$HNY$`+H&uw0*=yz)+y%oC)|TSL^(^sj}-htZZV;pM14M2otyLFRc&ChuJ&{Kan`5$tbteqNgB*I`1Bq8a5s zon3w>@wUJ`tux&seLY;zHYOk;GT{26r5xe?xPTSJC|T#Ml8szHDb>Ntw9lFZEQ+6mO2L+T~j}>^%0qxZ%g4*a2L%H=3pb$MW zAUEf~RJ|Vlztq@YN`~uS>Pe3qh$Z+hl+)({9t!;nsfAJgmck+bPeTa+AOrkoG(dcy zJ^Bj3Iko?l8{kv_bN2B{1l5E8s+Rs5{eRSe2FM6h!+?+GA5DHS&x!Hh*gr=5Uz*SA zP_Q~3ySfS-kN}?GZwLkOPXbn2zc<$38OC2@`L6^z4QTg!DyXmqCX~t$ str: return result.stdout except subprocess.CalledProcessError as e: - print( + logger.info( f"Nougat command failed with return code {e.returncode}: {e.stderr}" ) raise RuntimeError("Nougat command failed.") from e @@ -214,7 +218,7 @@ def lazy_load(self) -> Iterator[Document]: .replace(r"\[", "$$") .replace(r"\]", "$$") ) - print("content: %s", content) + logger.info("content: %s", content) # extract headings hierarchically headings = extract_headings(content) @@ -230,7 +234,7 @@ def lazy_load(self) -> Iterator[Document]: yield Document(page_content=content, metadata=metadata) # except Exception as e: - # print(f"An error occurred while processing the PDF: {str(e)}") + # logger.info(f"An error occurred while processing the PDF: {str(e)}") def fontsize_mapping(heading_fonts_arr): @@ -357,41 +361,21 @@ def process_pdf(s3, pdf: bytes, **kwargs): and structures the information into a list of dictionaries containing headings and content. Parameters: + s3 (boto3.client): The S3 client to use for downloading the PDF file. pdf (bytes): The PDF file to process. **kwargs: Arbitrary keyword arguments. The function expects 'bucket' and 'key' among the kwargs to specify the S3 bucket and key where the PDF file is located. Returns: - list: A list of dictionaries, each containing 'heading' and 'content' keys. - The 'heading' key maps to a list of dictionaries with keys 'font_size', 'heading', - and 'fontsize_idx'. The 'content' key maps to a string containing the content under - that heading. - [ - { - "heading": [ - { - "font_size": 10, - "heading": "5\n1\n0\n2\ny\na\nM\n8\n1\n", - "fontsize_idx": 2 - } - ], - "content": "xxxx\n" - }, - ... - } - Usage: process_pdf(pdf_bytes, bucket='my-bucket', key='documents/doc.pdf') - - Note: - - The extracted headings and content are dependent on the structure and formatting of the PDF. - - The S3 bucket and key are used to download the file to a local path for processing. + list[Doucment]: A list of Document objects, each representing a semantically grouped section of the PDF file. Each Document object contains a metadata defined in metadata_template, and page_content string with the text content of that section. """ - print("Processing PDF file...") + logger.info("Processing PDF file...") bucket = kwargs['bucket'] key = kwargs['key'] # extract file name also in consideration of file name with blank space local_path = str(os.path.basename(key)) # download to local for futher processing - print(local_path) + logger.info(local_path) s3.download_file(Bucket=bucket, Key=key, Filename=local_path) # TODO, will be deprecated and replaced by nougat class in loader_utils # loader = PDFMinerPDFasHTMLLoader(local_path) @@ -401,11 +385,14 @@ def process_pdf(s3, pdf: bytes, **kwargs): loader = NougatPDFLoader(local_path) data = loader.load() - print("raw data: %s", data) + logger.info("raw data: %s", data) + # Update file_path metadata to full s3 path in list of Document objects + for doc in data[0]: + doc.metadata['file_path'] = f"s3://{bucket}/{key}" markdown_splitter = MarkdownHeaderTextSplitter() md_header_splits = markdown_splitter.split_text(data[0]) for i, doc in enumerate(md_header_splits): - print("PDF file processed successfully, with content of chunk %s: %s", i, doc) + logger.info("PDF file processed successfully, with content of chunk %s: %s", i, doc) return md_header_splits def post_process_pdf(s3, pdf: str): @@ -440,7 +427,7 @@ def post_process_pdf(s3, pdf: str): List[Document] [Document(page_content='this is the content', metadata={'source': '/tmp/tmpghff3i39/xx/dth.txt', 'timestamp': 1697513348.1026106, 'embeddings_model': 'embedding-endpoint'})] """ - print("Post-processing PDF file %s", pdf) + logger.info("Post-processing PDF file %s", pdf) # Parse the input string to a Python data structure input_data = json.loads(pdf) # Create an empty list to hold the Document objects @@ -454,5 +441,5 @@ def post_process_pdf(s3, pdf: str): doc = Document(page_content=page_content, metadata=metadata) documents.append(doc) - print("Post-processing PDF with result %s", documents) + logger.info("Post-processing PDF with result %s", documents) return documents diff --git a/src/scripts/dep/llm_bot_dep/sm_utils.py b/src/scripts/dep/llm_bot_dep/sm_utils.py index cc739122..a60299ae 100644 --- a/src/scripts/dep/llm_bot_dep/sm_utils.py +++ b/src/scripts/dep/llm_bot_dep/sm_utils.py @@ -16,7 +16,7 @@ # extend the SagemakerEndpointEmbeddings class from langchain to provide a custom embedding function class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings): def embed_documents( - self, texts: List[str], chunk_size: int = 5 + self, texts: List[str], chunk_size: int = 500 ) -> List[List[float]]: """Compute doc embeddings using a SageMaker Inference Endpoint. diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index cc51f381..625109b2 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -104,10 +104,6 @@ def aos_injection(content: List[Document], embeddingModelEndpoint: str, aosEndpo Note: """ - # This function includes the following steps: - # 1. split the document into chunks with chunk size to fit the embedding model, note the document is already splited by title/subtitle to form sementic chunks approximately; - # 2. call the embedding model to get the embeddings for each chunk; - # 3. call the AOS to index the chunk with the embeddings; embeddings = sm_utils.create_sagemaker_embeddings_from_js_model(embeddingModelEndpoint, region) def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overlap: int = 30) -> Generator[Document, None, None]: From 1c0eef8ab742e3d98f4b6bf3dd7257f2b02a3b4d Mon Sep 17 00:00:00 2001 From: yike5460 Date: Mon, 6 Nov 2023 14:12:25 +0000 Subject: [PATCH 20/21] feat: add qa enhance along with para adjustment in glue --- src/etl-stack.ts | 42 ++++++++++------ .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 23172 -> 23185 bytes src/scripts/dep/llm_bot_dep/enhance_utils.py | 4 +- src/scripts/dep/llm_bot_dep/loaders/pdf.py | 3 +- src/scripts/glue-job-script.py | 46 +++++++++++++----- 5 files changed, 62 insertions(+), 33 deletions(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 954a6886..545ce6df 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -52,15 +52,34 @@ export class EtlStack extends NestedStack { // Assemble the extra python files list using _S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl') and _S3Bucket.s3UrlForObject('nougat_ocr-0.1.17-py3-none-any.whl') and convert to string const extraPythonFilesList = [_S3Bucket.s3UrlForObject('llm_bot_dep-0.1.0-py3-none-any.whl')].join(','); + const glueRole = new iam.Role(this, 'ETLGlueJobRole', { + assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'), + // the role is used by the glue job to access AOS and by default it has 1 hour session duration which is not enough for the glue job to finish the embedding injection + maxSessionDuration: Duration.hours(12), + }); + glueRole.addToPrincipalPolicy( + new iam.PolicyStatement({ + actions: [ + "sagemaker:InvokeEndpointAsync", + "sagemaker:InvokeEndpoint", + "s3:*", + "es:*", + "glue:*", + "ec2:*", + // cloudwatch logs + "logs:*", + ], + effect: iam.Effect.ALLOW, + resources: ['*'], + }) + ) + // Creata glue job to process files speicified in s3 bucket and prefix const glueJob = new glue.Job(this, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V3_0, pythonVersion: glue.PythonVersion.THREE_NINE, script: glue.Code.fromAsset(path.join(__dirname, 'scripts/glue-job-script.py')), - // s3 location of the python script - // extraPythonFiles: [glue.Code.fromAsset(path.join(__dirname, 'scripts/llm_bot_dep-0.1.0-py3-none-any.whl'))], - // extraPythonFiles: [extraPythonFiles], }), // Worker Type is not supported for Job Command pythonshell and Both workerType and workerCount must be set... // workerType: glue.WorkerType.G_2X, @@ -69,9 +88,11 @@ export class EtlStack extends NestedStack { maxRetries: 1, connections: [connection], maxCapacity: 1, + role: glueRole, defaultArguments: { '--S3_BUCKET.$': sfn.JsonPath.stringAt('$.s3Bucket'), '--S3_PREFIX.$': sfn.JsonPath.stringAt('$.s3Prefix'), + '--QA_ENHANCEMENT.$': sfn.JsonPath.stringAt('$.qaEnhance'), '--AOS_ENDPOINT': props._domainEndpoint, '--REGION': props._region, '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, @@ -82,19 +103,6 @@ export class EtlStack extends NestedStack { } }); - glueJob.role.addToPrincipalPolicy( - new iam.PolicyStatement({ - actions: [ - "sagemaker:InvokeEndpointAsync", - "sagemaker:InvokeEndpoint", - "s3:*", - "es:*", - ], - effect: iam.Effect.ALLOW, - resources: ['*'], - }) - ) - // Create SNS topic and subscription to notify when glue job is completed const topic = new sns.Topic(this, 'etl-topic', { displayName: 'etl-topic', @@ -118,6 +126,7 @@ export class EtlStack extends NestedStack { '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--REGION': props._region, '--OFFLINE': 'true', + '--QA_ENHANCEMENT.$': '$.qaEnhance', }), }); @@ -134,6 +143,7 @@ export class EtlStack extends NestedStack { '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--REGION': props._region, '--OFFLINE': 'false', + '--QA_ENHANCEMENT.$': '$.qaEnhance', }), }); diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 4c55004295474f583974132cc1a71ea4dbf873dd..168d9039093e165034d2d32e66b0e2884d64b032 100644 GIT binary patch delta 5175 zcmZ9Qbxa)G(ubGDt!VMWQnctUPH}hHA`4V-Z}CErvMo^D7Z!JiLUDI@cPsAh(!$ri zH#fQOcaq6W<~%c*KTdup8NVu&s4A4wTy*^3k`}N%l^1?|_yE8pq5|C)M0Kqt19fvMxjgJe2wyWymF3JXI@_K1IGZn4!DemcsLC=?eIlM z@{k{&sc6#aJ@4do-bnWn+`2B4x7U9)#w(SqNF$&P+-pVFNo0>ElOgjx%}<~!qCzx0 zcpH3lEk!vLRT_v9_Ku1`sTz@g`Z4fe3cM$S5$E4CTpa6TFqfriE`6Gxm2;B8U4s)gwFuJeuwNph z)iW9=Ea%ou23?l|pCs&8JEU+dn-K+_8I$KbCV)>UGO|Z2=rP{VCR>6IIyp*N*J>3~ z``S*dR5uPP0&`Y^-$VYYh0qGPult5q{T#X6!*nOvbf77}kjk}uo>RlW*W%5lMnWFJcd}lJ| zQe;6_CLR!_zzaizzIV6Fn8H2;jA?95%R`DL;~ZxCq)ZvEqQ_`5NUnS2)AXEs@Utcl zp>i}!44Fw7sDC{RO4*I&|S=5Er;n`;-3M zp`<96Gc>5tz_uLFjG81cdLDM}%V1)1(dK+VW4PSn?f!Vzlf~X5@#*$Bi{0?Z@bO~h z(l$rVuK5<;{t!l>0Qb0BsS3e7ez@G)48ctMEKZ9bFw;!aU+ zgf8OU-OkpO+Y_!P1AJn=%jq`}8Ddd*1BvZtC@jxA@&P zf*G3=bN^axJM7^j!T>uQ4nIU9%9~GEHtLW_u-vpdcPhq^1B>W2MyE;w!2kx|hAiX3 zH}LT9fOQwe&9BM{Uaj$Dx6~b2assytvGLdDK21AsQ(rOr>a@BW}CKXkT%uJ z(c-s^4VUvKmbrC2=R4pFxhpbL&lA>r-~4Q(3}{Ly=Kw9KaSR9cZf;XzaC9_&YK8?}o7qJL)q>BY5| zyO?H$T8L_ORlJgiZ=8w8L?{+WIlY_2C}5=Q7ROa`3m8rgAmZBULL+t1 zHxhQHGFZb=qX+P^666Ty;);D3ZRTgO(M-kwyh*<+dZ7=OkLN@FWQK+=_}l@FF(s;> z$;|8BP`ug%tl&+SS4w{{<~+3Qmlvt)SI^xrQS2kzYNW0s%odO*3QS^YiUby-nXO3y zfg>LXC>(J7Fg%f5kyE4xXhf3)sN$o@SLjppA>S3w!7_p0c4KCP3(RlU@dnQL3Wc%+ zT&39cWDnp3lhPGX2ccF9c}Ohs#vhrEipuas@4K&oKaR>atJR?L({vA&#S#UV>QXY( z@}b@@cE+Mts=EMUiWJdnPE)Ay4lI}>mod|2Y8b~0q5ID|uq5{>kHI~cX` zBMlE#vc9P5=sQOD+;tqPxEPO5>JYoRI=`H+gKnbJ2R#mHYoyXHJb3~wYEk9@3{A7L zfOHVfRRx2oQo1rn5yhySG?U{shcBK57Hc^dCP}eSkgcZqXh&8Th#S!-ZpPEx zjRg+3U3IeF6p0Id$J z61nbmf<`jR0+cGFwnSH+9~7JRL>EiG>dH0kWV9iyB?!C(7>I9NGM%g5=k_S%iBV$4 z`{ZEdgie=G-(!W8rnvs{ig)m=@!{vFlY!f+>t_bba=4&YJGu9NF`K@fy75Y`#U7+7 zqWmcvUUo_^wjGR&>!6DZsYEqnvFO1TR$kG&SP^mp=?l`B#nRVWuBiygttq0%qHe0u zuns*xs5{8IFW}l8&z*SPqe7=OV#=d@lu(yPqx-eG8WH$L9>GY8&Jb=fx}uwa6AXXJ zp}B!p?c!sF8@h(`v9Ol-_e|Sk6|T9+p8ad0j@o z%9th6J5lA7$2K*P!6;qMo}JVlg$egNKH8OXYz2japTgWEV2(4S+4fr>OvXvv6nZn2 zc?EHtzhPQM4HNwq>v>qV+p8|0E96QSVC&4oBc#P26A=M5&SLN73!p%vB?J6G0Jk3p z0`!D@ns|s1$}87w@awVa@XV<#8J&^Vv4IQLd8l-$Gk-1}qO4?Sbg?yw56<_hKb_;# zV5MdfUlO4S08Y~Ik$gFjl(6vXopFkY_WrwVyyBwlg0V`Yl09p`!^Vp!&8V|+u8&+I z{W@+=P&Un6GT2N|3dZp5Cf&u|iU_~SCr!4`@wI1}&2)jjG}2zz7f-+LBI|be7QoE+?p=QdUiMF&kA;)`;0xcGIue9MXmeUiT&z2`J^C> z-Hkp3ze0}(aRXyD6ZJOTQ4eKcn$eU|e&arQ8%r6k8V?YXqofo6*Og`HH3Vo2Wq*Yja%j22( zE|73v?>A$IDqo5*bNM_RVP> z`ChMc%Qiq%UCBwgXXb6LpH!saOXz8kWp7eE9bYGV3-Rb-fMl=ZuK@~M?+&`_b6oA) zOji%Xa1v>>h*f+)gwgY9mF;N{9MF2pXWblI@_G1$rj6}HM99Nor)K01XBjE{s)l)I z8SU901GUVAIH-WJRA029G${OC(oCZqEjWQjaWbvUC^Y$J^F>F|I?V;JUS&h%zJXgF zu3J;~+x$XsQfj268lwnF`0g_jkx|u6K@^@%6(eR)YOR>;p{=$EvAOkr!p+{jUe++S z)&>!COIf}bItpCc2aDV8B(*4Q+)NW`Tvr!Me8S><6FU1M!>Qe4(u~ z!@yCWRr6v8_hzZ_n{(h_l<0!YqB!9(Dp=x|Q*z(DEc_|b92Zr>r(%M}kIjdOHrqz% zo7QUy+psN_unjZ@*}AGE51F!9DMQT9I`A6z`|nX>1L4{59FW6N9o})@=nosE_(@!% zFS#)*)V<2WIA)x|cY~=PlBTVxU(>1R@kE<%xWwFC8gbRuY|K&&q_>3Yw7lw0n#7S% zipq;KdEPqmO?9}P&7+Fh`lY2*{0CM@R0+Bt$&Zlkqwb=v*Wej0Ox&l*)s)DTLh*H6 z>KC;Z#uISM`Gu;1b7e9vd6sd9Seh1P98&^@=#S2)r^HHUd_lstXg3L8e|ZGCyzF#Z zqBjG~RArmy1=jk=qlXA)n*6$N5xooN4(~RkD%dz*`uPm|N_tTHww`>aM|A2&*Abjh zH0xS(N2~mK$8jShzwq@(qk;*7wdAbA!XHKXj>r)%&+@hJLUQ)Zm8#AQxRzec?y=ZIKcb&WlfPuEj;7y&9^$qB?>8RnJXwVo^=)%~&$V0X5?&wT&LwEG z8<4R$X)2yK!^P@@{PB2vZFfK2JsexkcdKDu+z?kx+Mt)6S3Hb(VO&u>Zn8i=SjY`88n=D!yY?GsFzcFsxuwuK@NsAZvx%Y?F?Q zyhqnrF&-S5UIYYh5)k2B?X5Gd&-wR|vM+1L^J;%a0oQB!7QczuAVXD1ym5WS7@Qj{ zwDA%#@sju{i2*t(7u zuu_vjs0P9U3Wk?w8W5t7x_`!9%p}L)m-LWBla*se8_Ebkbt1gjPeT-r)^oo*DkW{O(iUs)5o!QR1TK9%}Pv<*}I#FZKoRVV@_Xky&^w9Ag3vSfbcx*aGikGtXd3q$@z ztvBzS!U+63-%LJ?c?b#FpQ8G+ZZOKHEh9RQtx1kJW)}3sa5oD64%Wb4OH9uPOr6gB>Y5}pz9MKdMCNWE!uIZ zbuCqbZvw5EQCmY)f_)kgXs)$u*QYsc)^Hhb^}dS5?V_yhkY+Nan3bsV;iHw=oF)M) zo3X)#S=@%mxdhY3t7stvV$YV=?MZYe5 z(O1!i?ygiP0;}D)ZqZ9DDg*m@)3|-KrwT>r^IN*ZiW1f>!}vt07KVnzC;~jq`&R>( zsvgTmESeQOO{`bOebuVv+zS*Y?IXMlvjO9V0nA{up-fmoafd`_A_I2#a$>kmqXuz>Lc zmHrb}z#zca-2Y^M!N5Qkfqz0-JqQXKqE(L^K_`U8uBn0oBt`v?9Q^A~Qv?90{)ohX zPvzH$KQ#)#CiQ=s(CL%V{Z$C!Tmen*Kg|3eWBf1s$5(#+fq?4sg8niYIgq0_e{z3~ z{$Gr}CIYU>!12E`{4IDA*YWYm9}w@4z4%uF4iwR%L5mQA;v;Ac@c!xkk4B;X85RQo Yc>cdIiXjwX4`oEy84zL|Y5%?XKLHD$kpKVy delta 5129 zcmZWtXD}R4xLvE4Aj&GyJJA+FM08PBS>5U^LX?CcSgc;6TSRXgT||kfA<gty-4*_d^wG|98x0suat8VQ|{ zypfaFk_rRoVi!vzJ3Bjza-2{l6}527@s6{NQN+`w%Z@MkTo^s+Snh6OVG1pls0Wc) zEt+zs&3;%A`~3|T#l1WZX`I3e?kLPO7djqqq!@d_6OZ8-rcscQTpM6}@lzTn^BTK0 zc)0pqM{(qfj;3UPBU6{6YB2J`{aZdV^R8TkJEI;3>so)|!$=X4;H=DY_(+^iuJ=OA z#R%t>Ibd+e(to8OfWA)ynkbrptZ`!eXpD_Ug^6rt`I3p{QH`957fYc;sS!huy@n1r z*&76=gtzup{OO!?6Z`43CIPVxcs(-{6!OPw@@@jwd7|rK&D0tnik`-}$44Ra9h}dn zd&(`tlS*G33GAQ1PFolotz}A7H`+w3_~T_@Y9&GQgioyaDC8dr2%fmd;=LlqGLbw0 zlMBC;-AjG!EiTVYd;e0{L9nI7AfZhuRz8s^{u{wgs-tIrH0cF-zsT?t`60tcKs|pt z9@1gxhG=pR%a^m9JPOzJDd=6K>iYub=P2l(9;YxfIlv~Q;_-KOo;q=Lf1`_C#ZW$ANHDn$Gt zb%chD3VdL!vW(-CdVvTfC85GkMkL;WUX1-;xki-Db7Ck6KYg}Z%TleGMACrb7>}be z+U?#Bl0Ropr-)4ZL0TGR&K-Lsi~X**)}lH~L9e9(ZX*NB_bB!~stHFXnI~TIyprno z@ZTWB^ZvG<5+FNgG6m{avw3q523QW$z4PnqBiE>6 zZi3-A{g!;2-}{m^#DaZ}w;m9`vUx*9S`)ZEUbq;z|Lf}Q61#PPtIzK6L}Im!;)lS4 zy;A8^$>ycbXTLu+qLD!l+q`!V&fjVv5yvZHwh1-?9w+NvkKW=uPw2MzWx4_n4@$8M zjyD%Gmg6A`7vm^CQW4_nGhJ-rbuh|;Gzr<|H>?G-{Y`sh1%YI7xs@~MM>x0879j9f zOGjdy@R_~2XiilavDZCba<7k+N_7MVGf(?q66 zz70(7Uf0gpTjBO(k*irDdV>pK)3oij2Q>0w`CXBnK>JYA3ZsMj-XCYD)3_JA)sTsn zds6gnjn>8Bn#szXM(qJE#kgWzO8~Q9j{22}MEMkpu(adYUO`o_IjWdZCci*D&zj!a z^NBk`LdnkZs8|kmi}4%6sYtu#xlg&UC4KlYewLh~%TM!rtNxs?Q*>_LDnrWj$*Aj+ z{OzoVh9(meC9)1R>BqGhQim>~50PA@gqQ#W=GRG%jF-YZi>wmtP7zXiK&r-j>EkwR z0qTUI8Qx+MQWTgM2*Y%%;(6xOjlP5acY;DBo)~f4ALyF&IG(uHAgki{cKWbPqEOYA zECyO=ms9Zih^tH7U8$(w!$orG8(E?!@02fR^8}Q=m;s()t8dF3>Fp=YJCX2%q`sG& zs+52VrTn3XbEXrM7G>`5URKvQXp2UtyW4lPr;^`xg%5_d1Wb|(DrtstvN_&4m^ux` zD-VU=e-&1&&2LV)TFOooRJ2=CRjvwxC2!bA`;+yg%)hL3GQXi1K9Z`+>XbHAzH6?w zLp80^Xz)zT4)h58G}A6RlLU$1!x1`I-D46>2EhurjA+vr8ADQPHGH1bf#40JgQ1@G zDSPP(35bWTMKcLpy%rRKA6QUp5$m6F203SC!P!V}`V&0;Rof27H!W9V2`=8+AX5g0 z1c5uBN4)Y3oo~0*E-gBO(u~$B%dIEMotSH-wku`fWtI%EGzk)c4dgiefbX{KYcbhk z1B#^4MxJpWE9F;dhE)2z{UQ%u)k&pb5(jdw822PK{3#|dc?<5+>X~?iO6q;lVc`Pv z<4Fm;c%1do4oVDPP`!(d6h|4HMVj?2I`2BcAUZyany?gQs$>xP{j!&bUogL!xdnvG z?XUIn+2x`_k~TA>!AJwg0UX+dc1~YiUp;x_^U}!f9;~myyG?yBFONygCo(52q6N#X8- zVTHEBEug4}+y~!|6Ywv2KJ<@*Y{ZZ~y^Gs}xIrf=EfoXtRTrXLo*9)z6>UHXJ%AQ} z?E8-bwa{`hB-aFz6=&NHeUA?Ls$HFgbIo1wNx(AKnYpaxOx*6D#=egx24s7XV?}!R zrkgE^6c?jaqdSLrTxv(8+MnA{sbaw;%6AVM9GEZpt~Rag(NuVARr&4r$CDZ=>O+wB>DCzn~YmyrSZ~4A&y|=jpSrpua5UJIF;uOM`b-d{I{DQMCx4qaTKBMU~bU{z2?}_?$Kz41^grEpnR`vNTz-O!53oq||kz_Krs8IGp12MH&-eV`!mK}f>Vp(e$qe}rtPP@O`1y=Ud3<$<|B0v*mbkls6KX9*`6 zHLj7Hk!X)8oDjIaWJ+Y?Tzf2aafO~Rk^e%N7v%Ez*3?{OP=&ZF>na@+Wjr!ksg!8N zm`+1Uw;oC=HLMTdnqQ}`(*h7zb(~hUJ7&CHr*D`vTWgzL+zhl9b7iO!8Y$!Ffzo2e zBH@7py_8Z$=Xjwy=QLTQvHYG?2A9k;Nouf#K)%ik(be|6P!NV4pi9+uIWCBS}q)%f`D(YG%NExwx|DK zfs)E^tY=bq7?nt8)mXPFxx2ft9>7b^ zfTneA(M~Glh^mjdjX}1x{x*!`3CQWqy9dOTDTP$2$+0@L?omsGZd**&8>v|-qr>|y z^}*MRwrs(}3>s@otKB*R?x8~$sPz!oz^Nkgq9Zi{EQWcRGC}>G4;sK}93paX7 zx)K|zegb0#Uy6CG&Z-b7v_@gfHL=qgxF>{PTzbV^O2|F9nQ=oA)EvM6MKnmx`qrlLWX#{Wq7E&N*_AO`9@!HfM9ic;>0shN-2hpD=RfWoBO-3&&)~ z(`3&Qo`{hZ3%T1$H!tpasFvdg2F1aiG{w%J!)~yO_=toX?~Fp*^~YxC$JMxJ88K$( zm({qZC(m#IqVi@b_TTly*PAk#roriD%lN&Y-+rShkHd46g;?dSjE+tMp3VI`H=wzu+XNgZ~A6bYC3k6&Q6N%X+%^ynSSaF zKbbD2!xJV5o8B2;DccfSXwOec)5UXf$az%_`KL%Lswp+tL(kF zFjqf9$I;!q3QmY;Gwtj6S3UHdHB%I~^wQoI`!jt#e{*K=;p)`t^ybvzF)=I2lb1*` zl&M!uVYJbGjhhj*>Oo|^_;u;+lN*-7`$L3=X*RQoi>8o_kGktCDh3`Wx={mZ1d{oH z5TE;CX?VkvsUz;Xjoc{Et&jK>w_-Dv&|Nkx+4&7!6qHbyrm<+5cgRt%zLCp4DP2jd zME~R+rt()BGU1fPHo0058x>QIAvhrmo>RJ(rX--du@|7{=xBbv-VBX8U>wWx>x zXWWSZ04@Nk9>#_=R9DqERC_apNh{H2BwPtrBsbn{?QPEC#41zuZ%=#!s(7zV&%Jm3lL8zD1Ti=!QTX zG$AWBq`_0T$=kK=TaOcp1+H&JnrxYI;-aPHfmBs3{_(}87nF7CL~rs1<$9?>W3aVg zLz5})d-cFafCA^GnbxjL`{b>yJ3F>g<#u_+HV2sx(!;*0ls_lUqd7Y^sc;T=Ze+dI z)XlP^)9)BSiak*bKYPm8D0QOU)O@a8&PXR-ZsnxObYQto;HT1Hh>cyB`^DAz^)bHi zFK54X8GWuVnF6)35k?t1MwIMvJ@(+=4a)(Wfpy7r>|WlW$GnLKcIur7-LQ-6o{Te( zXSr1?6~vYF8#Vq(i$BbdH~I1_C*dsj8t6w`{I-W9k@doM%hSbM7njthJ*=H68LO;= zp}Y-3L$Y%k>-lKN1YpjeWVQQFJC@$lZwBGz`A{o;a<1%K1QWkn(3 z)b%bY*E0F_Y4!)ffJTfAjY6LpDp7?R-(6jT9aDG5w)bO^p?$QkDH=isf(WNJj-nPt zrbHk}GrtHR;o?SM`V1pE;3ie)vcVtRbS{uvyT@A5&0g$(>o%Gd^ff=>nROwbxyM~t zhvYB_bl{@8AO1^_!olEm=(UynrL;dw&~4k5yDHYnXrWr*iyR4o{WJ+2SsYG} z`z8?JI3tQfmJS6qmBh0}MVhJs1$j|cyzHn=Qz77?FpA#n5%5^#4+Vsh7bQo9m@c-VqKPmBF>s^??>_6>~ zKQ{Gx|KPqJG3Ec%|H%$z7z+@s|2Hd8!7w`ZzcEr=3f{B&WAnE-|I6-Ae-{Qv)xpI7 waVTH)_)`8kqxe&T|BG||hr^CyvtT0q??^%^SYXKtqZv0pAA1)7) List[Dict[str, str]]: + def EnhanceWithClaude(self, prompt: str, solution_title: str, document: Document, zh: bool = False) -> List[Dict[str, str]]: """ Enhance the given prompt using the Claude model by Anthropic. This function constructs a new prompt using the given solution title and page content, sends a request to the Claude model, and retrieves the model's response. Parameters: - - prompt (str): The original prompt to be enhanced. + - prompt (str): The original prompt to be enhanced, not used for now. - solution_title (str): The title of the solution to be included in the new prompt. - page_content (str): The content of the page to be included in the new prompt. - zh (bool): A flag indicating whether to use the Chinese or English prompt template. Default is True (Chinese). diff --git a/src/scripts/dep/llm_bot_dep/loaders/pdf.py b/src/scripts/dep/llm_bot_dep/loaders/pdf.py index 3f3fe597..89c5878f 100644 --- a/src/scripts/dep/llm_bot_dep/loaders/pdf.py +++ b/src/scripts/dep/llm_bot_dep/loaders/pdf.py @@ -387,8 +387,7 @@ def process_pdf(s3, pdf: bytes, **kwargs): data = loader.load() logger.info("raw data: %s", data) # Update file_path metadata to full s3 path in list of Document objects - for doc in data[0]: - doc.metadata['file_path'] = f"s3://{bucket}/{key}" + data[0].metadata['file_path'] = f"s3://{bucket}/{key}" markdown_splitter = MarkdownHeaderTextSplitter() md_header_splits = markdown_splitter.split_text(data[0]) for i, doc in enumerate(md_header_splits): diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index 625109b2..e65a5069 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -22,6 +22,7 @@ from llm_bot_dep import sm_utils from llm_bot_dep.splitter_utils import MarkdownHeaderTextSplitter from llm_bot_dep.loaders.auto import cb_process_object +from llm_bot_dep.enhance_utils import EnhanceWithBedrock from requests_aws4auth import AWS4Auth @@ -36,13 +37,16 @@ os.environ['NLTK_DATA'] = '/tmp/nltk_data' # Parse arguments -args = getResolvedOptions(sys.argv, ['JOB_NAME', 'S3_BUCKET', 'S3_PREFIX', 'AOS_ENDPOINT', 'EMBEDDING_MODEL_ENDPOINT', 'REGION', 'OFFLINE']) +args = getResolvedOptions(sys.argv, ['JOB_NAME', 'S3_BUCKET', 'S3_PREFIX', 'AOS_ENDPOINT', 'EMBEDDING_MODEL_ENDPOINT', 'REGION', 'OFFLINE', 'QA_ENHANCEMENT']) s3_bucket = args['S3_BUCKET'] s3_prefix = args['S3_PREFIX'] aosEndpoint = args['AOS_ENDPOINT'] embeddingModelEndpoint = args['EMBEDDING_MODEL_ENDPOINT'] region = args['REGION'] offline = args['OFFLINE'] +qa_enhancement = args['QA_ENHANCEMENT'] + +ENHANCE_CHUNK_SIZE = 500 credentials = boto3.Session().get_credentials() awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, 'es', session_token=credentials.token) @@ -105,6 +109,16 @@ def aos_injection(content: List[Document], embeddingModelEndpoint: str, aosEndpo Note: """ embeddings = sm_utils.create_sagemaker_embeddings_from_js_model(embeddingModelEndpoint, region) + # TODO, parse the metadata to embed with different index + docsearch = OpenSearchVectorSearch( + index_name=index_name, + embedding_function=embeddings, + opensearch_url="https://{}".format(aosEndpoint), + http_auth = awsauth, + use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection + ) def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overlap: int = 30) -> Generator[Document, None, None]: text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) @@ -124,18 +138,10 @@ def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overla for batch in batches: if len(batch) == 0: continue - logger.info("Adding documents %s to OpenSearch with index %s", batch, index_name) - # TODO, parse the metadata to embed with different index - docsearch = OpenSearchVectorSearch( - index_name=index_name, - embedding_function=embeddings, - opensearch_url="https://{}".format(aosEndpoint), - http_auth = awsauth, - use_ssl = True, - verify_certs = True, - connection_class = RequestsHttpConnection - ) - docsearch.add_documents(documents=batch) + # the batch are still list of Document objects, we need to iterate the list to inject the embeddings, the chunk size (500) should already be small enough to fit the embedding model + for document in batch: + logger.info("Adding documents %s to OpenSearch with index %s", document, index_name) + docsearch.add_documents(documents=document) # main function to be called by Glue job script def main(): @@ -146,6 +152,7 @@ def main(): for file_type, file_content, kwargs in iterate_s3_files(s3_bucket, s3_prefix): try: res = cb_process_object(s3, file_type, file_content, **kwargs) + # TODO, parse the metadata to embed with different index if res: logger.info("Result: %s", res) if file_type == 'csv': @@ -153,6 +160,19 @@ def main(): aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index', gen_chunk=False) elif file_type == 'pdf': aos_injection(res, embeddingModelEndpoint, aosEndpoint, 'chatbot-index') + if qa_enhancement == 'true': + # iterate the document to get the QA pairs + for document in res: + # prompt is not used in this case + prompt = "" + solution_title = "GCR Solution LLM Bot" + ewb = EnhanceWithBedrock(prompt, solution_title, document) + # This is should be optional for the user to choose the chunk size + document_list = ewb.SplitDocumentByTokenNum(document, ENHANCE_CHUNK_SIZE) + # test the function + for document in document_list: + enhanced_prompt = ewb.EnhanceWithClaude(prompt, solution_title, document) + logger.info("Enhanced prompt: {}".format(enhanced_prompt)) except Exception as e: logger.error("Error processing object %s: %s", kwargs['bucket'] + '/' + kwargs['key'], e) From ad8b4357e45f6cb22e4e820f3d356718c71c5e0a Mon Sep 17 00:00:00 2001 From: yike5460 Date: Tue, 7 Nov 2023 05:25:22 +0000 Subject: [PATCH 21/21] chore: add retry to inject aos --- src/etl-stack.ts | 2 +- .../dist/llm_bot_dep-0.1.0-py3-none-any.whl | Bin 23185 -> 23185 bytes src/scripts/dep/llm_bot_dep/loaders/pdf.py | 2 +- src/scripts/glue-job-script.py | 41 +++++++++--------- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/etl-stack.ts b/src/etl-stack.ts index 545ce6df..573dc062 100644 --- a/src/etl-stack.ts +++ b/src/etl-stack.ts @@ -97,7 +97,7 @@ export class EtlStack extends NestedStack { '--REGION': props._region, '--EMBEDDING_MODEL_ENDPOINT': props._embeddingEndpoint, '--DOC_INDEX_TABLE': 'chatbot-index', - '--additional-python-modules': 'langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,openai==0.28.1,nougat-ocr==0.1.17,pyOpenSSL==23.3.0', + '--additional-python-modules': 'langchain==0.0.312,beautifulsoup4==4.12.2,requests-aws4auth==1.2.3,boto3==1.28.69,openai==0.28.1,nougat-ocr==0.1.17,pyOpenSSL==23.3.0,tenacity==8.2.3', // add multiple extra python files '--extra-py-files': extraPythonFilesList } diff --git a/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl b/src/scripts/dep/dist/llm_bot_dep-0.1.0-py3-none-any.whl index 168d9039093e165034d2d32e66b0e2884d64b032..9097dcc17ae52cad446b7e55812dedcb30a8ae1f 100644 GIT binary patch delta 6556 zcmY*;bx_p-)Ai-jeFQKF>Gr zH}meynKQe4X8+rro!yicq?8t<`Wz7g&j4%PC~1kYn-?HZj1mY$k%mP6BCSv40q`9Z zQ*j6~tdq}S7R*zr^QF;Qc5_R6PTPUcg4`=;6&Qw}O^IR|{fu_kEq|3faHy01o$kVa z6PQOxW%}MW1XbMlS#`GiN(}#1ud%vfQT(zQ)zbIQpZYnGf>Eg63~0cJX3DY;GX$4r zcC{U+%2hXFFX&h$SW*K$difrO5zvyqN0!>$tjdzo(LVctH6J!Qnc+uu^6|kh|4a0r zuG0}Tkv)e6+<39Xoh;8C%1 z!NOs-wvhk$YhqCsb%ViH24pBjo5G&9<3N&!hqSHaR(DBIqdi}VD-+5e>?{%Jbvm0jJyJ9OwWq>-1OZTLyv{GZZScjF9Tj`~<*6RoMae2ZELFxXFp9bH# zdW&la__!q=2cS1VuWzR>v#DIK?HKFNCq4+ zpv~?j{)Uc-Hpmkud(4-r5nW2Wtnp4wcts_o>s#bi4DeCo+sl3Y6NI45Au$A+qSlC4VhwZy$Hf(UcC29&U^JBBRumG^V9=)i5Dk zG&5^k*BVdbhCj>FP@JhI*&FFCf)k2=LDke&zSt0Xh0>IM6HUE%8;mZE4N?R-yu9cJ zO=~8qQZ0&!)bYc#z;b@mw_JK-PwYOTwkuB6JHtq{)(4B{5dPA{W~f-pTa?ld3q>VzR1ui*`89hW1UbE36YQ} z4^#ivvkmtE>T4gpUf%Bz!znMv^I3n~Kb%b~nou-4r732|^WIVkLk{E+f#x_=o@9NI za;y=I2t-&S(9aQ#`^xx4Bq%sLKE;B@jBLrmn1qs|WNc*T%n;LZiufBiIRD!v?JKO8 z8uq`)=IqDtH843yN~LrYowcaW#kwPk!UZX#?n3sUtw_=4aWkDnOXXbTOk}QU-@iIu z^UhsJ#U7=GnbjO=O{uH9I6H@`60|!67Z#R#m8B0%0gRGm%|Br+P2jpr(y1^XFz4+b zx;H7!)e%m#Z*1zMP;L{<%!Ao1KNKz|`?I|U#h3g9zG)? zm#>{ZrP0Kb;nOR3_G@i7=kCG6$BLqieGw4|;BEp5emk<}d`;Qv|HQQ(LkEf>rkuyhSWHzC z0T(__Ho)EvXO!_06jvw_Pjgp8H%C1xrE2Ve2qtFcDuv4A;4HMnzZtrMm2|zM=NK@6 zY^x6-8Vh{WmVqu|jXI;AYz&Os{oWb=2C4Uj~iW1te2-L|vEWCHA7lH=;xu`C$OOHGy9zbBEj!X4V`r8*a4R97f@z^{j_XNe|vJ-13f1H5`f% zX9xGzRKjx|(w2y-_+22DOAwTm%RqZXf#t*4-|{j3-5sHD2rA}oTfaiHI}`~2u;zqJ zCvVz4?CLx2k|Sc%FZU_%^hmbNF*!h*23b1Est8n9G2+T@7jniF$uKA-&c;u;I3NhF zlww!^MiI9joz*fVVgrkyYBlkfycgv^x;Jm_QQuzmt zCU2EOjG9fr-aRk;V$qyAL;&p}mm#~M?-75!*WN0^PbSf@kKauq(}{w8I7+x9GmzD4 zR3Sa2Oi8IuU;{@JT%!2TPMAl9l3|o1-)W9!2$YkI#abi}KUA%3iX`2lqBB@-u0T|q zK(ZmrP%pBLFnbQNmG=$&+)U4iW*lkSEML9YKuv8kKxjyXKMaHZEijfmV=dNh)fAtP z`fFg-F>FF0tjUV08-0#GEPK9Ey>#55i88dhVZ-HkC&VB}7S8V_JcMQ1MH4vr=hREr zPY1DG--aT*o}^o}?I%BS z0AfNyb#yo|FQWfx43NeK)PXZS<2!{0)}95)C$R4lra(Fnam`4_k0{E$Tz}EG6HDk^ zt63?$yMKwozwjgbh1qc|e!^bCvWg`XJ|~I$b}+})DaAUVO(fz~-?u zkSL=5RGU9hF4Ho-uBBjTaB{WeFQw8pp4ia1XiWn;ZQF2z)&XYUmmj?bajB?-?Yd4e zc1LJP0Cdaw$gh8fzXukbk7O^~@~7euYRn6--mt76y&CJLO&-)5YDv6cEav{qv1HMT zPE#}Oc(^7Vn&Akd6yY~>Y)5f)H0qldcfX7n3~#%{NAqx2a-G*6IJt$*h{)s1%Pk_Q z{$}-ebHZ`OpnB7orrw|_4v#%086!--scx$O0;G_t$E6!r553hz-}trSt!JC0gU^|Z z$Ag7ZEMXkM1Rh{JP7PKLh^sV-xJ&fct30TAM|LK@KU-5uN@B*ecrVCy?+_qYMZ(Uq zVZ_rBOwo{Ohm(h19hfiYkiud$?(j8Jaeo_o2fU7utuyn(%~+~3%C>kL=`kEcZlAXJco(MMwb3zJH@r!z{hy2&9;EP^^GDg$&MWr!Z5tu1;G*E;PJZbp@ znmA`s_d!M=S~8g*QNgl7SdBIZ8ok#EAm(U2C>rhlVWywJR9*j!L{?&`sc`!}T)W!x zfQ+`hW`5_##K1l8$Y{zzm-A)*Zx(yezlhP(s@>YbvI_l{7k_S-mR&aaZ~3N$Z}5bj zPyD;~2wiJCYaO``c{{xnEiXCJU#NK{;I5qokX5<->@>hQ$~Qv~plRxutSuSn1v+U; z)pTJ_oux}TDTZSVc3CrX1*%yWcgF#XY{*VrXpbUeg|NQJM>2KYo)`FMvjGw_jV?PY z!mcyrbu}@$w&)YdSj8fg?M5swvuU$?7h3%V`)_)jzVGMieN4^N`y0$qa#)KPrFCx9 z@*%;a;f|G>rHN%#CJ`Xn^&v@+2KEDXULS zOE(_o#Z8o&tewXoIUU=s+#yn;gg3XBSGifXnKP@svYus$A?`1;PS(1|rM)r_^M<{? zXt}4~iAzHbJ%#k~GRNiij5rODC@+fDF;I_LhZ>!h=5t}KFG)G=)ypO#1CN;}eVyYq zJdzyX$<%lc|p+*x8v{WUXUhM;H%2gdlKHNhB zbg#ZAuU6J8?QpPmWhL;^)ogfSfz*q=rFue?8&X zpp)XQSB?{oX;@?h8X=*{$N-VIf9WfGMXP{ubBYh)8xWPA=mpz!(Oi&KgB_b!a<*)7 zRVylV{PC&4-KR1GzC}!^oqTp3eE$PdE!w5I2C+f!Z!Su3+2JM({k+mE4o%dI+L&!V zF#NHhkHej6#Lcq=)igSq31y#_)B3gE4iTOnL~hZkpY|mM_@_iPuXJ?Gh`@zTIecerY0imE!x!Ce>{+EgDlq zsaK&;W-kQYLONkFrxpxO4?Wu|jBGaJKPWC>b{}@3&%_x1MoC5J^Cf!!k{OP3^4)kb zjdU+1vt~egl*^3-cB^N+3)_P3zy)Dnq<#fR=N1gD?0N$$$YL) z!5gep;oS8;UO|i%d~{_L^~G|2TYQ1{mTf@i_g9^)thT!1z^bpW_(7?OdU(X6mH7A1 zs)F51Gp1*RlNAD&W~&ti^w)T?mz=Y@!QD5`6}Qb|x!-V-5L~}i;rxm%7dbghrB)tZ zKfT?%MG3x-#Iab1*RfelaqW8;p1Cems@yX)c@jN094OZ?%ywRI^@v}4B0aT}r55t# z@=TB7s$(zk129wikEIOQ_@gKFY}j8d2H>&luy!H{VC2TBV`Mm9`>xBnCbZDs zMpvkw4$2m@|yo zqHnPRE*CD;9@1 zq7`z4e4p3+zW@VT2U{ z`@_aTEKTTCwh-DDi*p%bim)U^H3Okk3LV8)Ab1S0avxr2GV*K_LwAL`p0~eQ;qf&& z;$P=#^JCxR6Pn#qh)Qsu6w$*85ip?zqz9vL_%7XzE z<~i*eaSQKC4IyIV%(xd05Rn;#ib2S9=?5cSV>+q{PGc{=1VZL8^>MozyKC zO4W}wcp6FAC*0Z}s=x{M2`GNgg&sV+XMxvm9_oVw1Fwm?<=9D#a5@oZyIzw$}Aj7Z>wLak7@+g4!$qEaWv{}^dm0MSxfP(U+Xfjz|8Lph!vi( zL-yK8J>RmGW@$}SUs>jL@&UvPvUja4s%r~^r9BjW8{1SJ2Bub#?8eWN%F~|BY4qS% zk&T*}cck$mg|GY5zz>ypc^#hTZs2ddl<{sTI{<`y(uS)mNU@eDG~U5^U7(b6xlBzM&G_AbmdO}*#>xx>>vN^p7F_K z&qnv|qUymlBTDz~zUsl{()9s^Ru&~dbE}E|bRe9p6PR9Fhx}!(Zk4c1AIVZg<$WHa zW%>s8X5Wn%`MEtcU3U093Io;u>MZrlamXU}g$w%>d1WtXvtmG}R?$5!gaH~L&PFJ& zEl=g1KW>+wal7r@w+Zwr89zK;USA8K)tPMnNX1Mvjio3@TDhQkdRp$w>+?DL)rb1a zW)={6bB_eVIWx~6c%ed2<|`ETP{!GBp!&P43Z0&z{#{ce4YhI@I(&dg~4bV>_r$}Zrk#*y1Juy5j^1D6hIDYOKp26dWED+-}*Zk0XgH+a- z@FW_elk#kC_MpPB|IC%_k&azV-E&OQRhh{i@^NAy0=~BY5i5xa(;5ruVc*@{al~@5 zbX(DMS{R#)BA5$ZgA&4#pa}Gc@H1Een(}d)K=$pGxQKMRKhttvE|kM0+`Nx<)kwPin~j?yG1Z5IrcZL7)2Hfm2pp zORnbIXll`02 zMC@$hZx1SPAZ{QIu-C` zmJX9bJmnXd5R`|63DncPuRRs57#19w{^b71`^~}sqH=N51MLI(T9{`VZz@FeC(P(8 z*INzDE*Do%zV65?=G%Vs0lOVj&=y&;2p3S>yHLY81oM9|UZEYeO$kT+D=MOOb!;s0f8{Z~Oehiiq#llazF5P!mOEKsz0QXw z$e$d~yeOttTN;v&790g4iL1jaXw2M4HkYp3PS3Ni@o}z_K|x@A16ZrH>!c2a&2MwdV1jjTEUna_DlAkW0fi#bkP&n%)B*@1VKW> z`W_hm3}n%ITkrCMHR+$Nvv!`>XZvgg=x$MF8r&Ojeh5wfwkuO=j4rml$>?#B@2YFV z>b2~hF2PJ8AX9H}8Mn;)^koH(ZHHPPx>rI=jo!i4W0Zedm*Fb{WUmFy2U~W6**T)! zQ0RJ@uor~XOB<+R;=Hm^&T#UvAg#v83)isWw?coKgPoTv&J&`pE7q5L+749x# zKyF{79W<8QNCLPU6y!f|e~8PZH13J*z6M1N@GZm`IH>M!r3U6ohDZ+@dK>p85kfeS z6b{Qz!CvKxL+bp%JZ9MrVfMl94iUx@oJY6_XtqW$Vs$6yj8M!rA39XiCiXVC!!P6MPJ7X52IDqo}@Kd+6nE`hfQ@hVu_>|3`zZ`QKvH2IQFke=4L&8N2|S{=dWr1FHW4pM7Xg delta 6556 zcmY*eWlS85vR!N!ch|+;U5XTUcZcHc&K55&TWl#_C~ifH+edMVdvR@XC@lW^-Ix24 zH_6G#%$&@>nM`KVTM*J)5E^nt@bD$<4Q+>(;%|unfH)-pfEx**;&uXxtOnh;O}T-WeNF5iUPF5|$O?o*Zq~Wv?-Q2PGuu2%XO57mH}udhsy3J-l_B`QA_Wl(J+rPU z?8lSb5)azd4BIVu1iqHg3xR-k!sCGD{ZC=xWvz(AC8W6snQG^)~TC2y1>zDa) zLjZiTvBy@}oWwA?3u)S~3R~18s>+#1j~PO#98@2Rvt(>nvgu($4ZPOPK|gZ`B>=gn z((}*Xiz??Z^_Ha1a4P~S$&wBcKVOBOdX!;{MdL^MppCK0A0OyO4_;DhkH7{f^!H@d|ZM zNK|dLd>H;kYHPGfTIHpP% zZHLD3=jP`P$WNSOvkL~M zw_%MUzDKh31;^U87}X|4={H@YqiA_>*8z=-L0>UNZws^wup`12s#UFR0ZEbQSbBV^ z3FuJ2Tt(%*QaxEO3dV{h3O5LAjixx@4UHNMIdED8Qv0XB%YYfkDqaG5EiNee6J|f* zp?5jKgKH@YlhcHNGHE0?=Ja2`DT;ZcFl?K516zXS+CakIe(N=j^SF_5AioQzjyqG| zGn~bi%{p7ir?LXSBZJTYz~TS~YRC=t#ck(6s=0%-7oQIpyciWSXI@fIzl z(G=kuaHvJpQU3favga3wp@`-pgnf-Pp67~$J4BrC$6TXr%BRA!#WYyDa$;R{7(uwi z2pTk5D$6M>*jQgj&ZYBQ0?5c{pK7p1I|{%VA?40yHvwt z%|gp-yk2PbIN#p~cr2%SD}`fV^j||Vk>oCuTYjDhyWTySwi3v`sG#_j5@S}-WtdrY z`;7UdT7WvM997P{Y~g}bLKD}OF+Hj=#YNeY&iZ~m+=p_?QY#-l91g1)(>;-;AG2+% zKPIqiTGJ`%rnA7zjq<8e59i;T%)OvvsHrU`>Cqx^V^RKLuXomy4Ncw0tKxxlL0*~C z9>d+)aRMjMKNW(zUoz&VETwz$(37b+*n?1v_{oOxog-cnpMX0cTn;y7mj`V6>^Jop zCQ2W;u}>|J#v(Qm4^S1P$7tSqo6q?lJ!e?ngGIe@PSr)J!mvo~&@j8NdOjpCt$Wsd zC?gaZAXq2svS0ozxUsBB8BGt#sDEm?Rn#cM6b{ypa$6N>$^VgO1HEM{oT1zbSjKaj zap<0$7z0%D=&Fc%$Lk>07 zmls@Os%18?+W?Q$(4saRKhF2P8HK0+(N4D;8FoXZMaLiUh)8zahoM5Mo7aGE6Z!%Z zx34lb`Mjpy$4fqLEXBF6fbSzEfsxeX3J4au$=iCfh`XwoiqR4NI`0^&qFj*mBJ>K% z1&Q16xLxABrmE0IU3+hXZ{0%Ey{CO9wOXRMe*4w=^gTHJfWIch%3Hhhyce5W>G_z3 z*y-4dd=|`pLx{C*;8_ld-He@!9mM!^-kg~tx0a2SwN#k}dh3mRVW47;$qxEtk_!&z zR-tBGLvCK~A!j0$UBh3L2>}Wx4N5KM(`Qh3`#Oir|2Ams^rF7Rz*ZRPXr#@tM~FqN z&pYA%23j!SX=t~xGgEo#cOr|F=DKbmd@Lz%dU1`-eh0*p`PV1mYCQ19{8z3)P@4_QJ@WBadLJi?y*FD{!$NF+nc9#H#L2| z)*fi?tu*xt!{=G=NVVo}T_dYT9|ML;sE(6qEi8;&xHinY$_~4#9)9p{o82TN%^|2D zqAxGb4k^e_baO!n?=4U`j!FI8B3oWv_&$a@zJ7NX)qWUJDlox~2pe4?H?sa+AjB6oz>+#|a?_xi?P4i?=>4Z3g1Y^J}pqeE` zS6frwy2cGNOjWKAlAD8R?7At|sW7|RTjz8WA4-RFT~K-wo;J!{hv3Bd6bZA72(qf} zqfp9Oh=<4&_w`K^lpWGoLS`MD&FM;Xsq0#j{DV#ivc-Z}U@8{UNhR!=Ka87ZpP*zf z;LD@J6E}e*vmNR5i2Nqq28;lE6f@s~pCDu6wl2C96VymVZwwE!ewzmAweSM^#FRBa zs+Dm!K`1MR7};#UM?W{JF9vuxK{l4EEGBKJ0>Yk(9WFS;ZGxomtlm{Td3pwolicgn z_%aI2kpwm{ypIDLy!mhobxZft4W4>o_}){#Xpal|W^^ETS%o~yK0ww?Y%$UtsPIP8 zTs%`d#yPL8EUC45^>^*v`z&`Tm91U%q$FU&qkR`<@j5V2|9BcrAtC{o^WVf9NtWWk zhKzj1D&0;Td{o#^=B=N3$3VtV^x327T(fn_AuUJ-98(tSLUpLDWd7E$X4n-+(j;$X zz9kBoeFY%o`EFp@6K#39BfKnN?+7CX&EuE`^I$z~>+_e$wae;HUJ29*AF7aY*0pbU zR!O&TMZ(`P7uuBi%MopIrZIxgjUBAxks#a!#H`&$ahY}4-+nZnH%wF4QX2_SSrDQs z1$CjTB1h^jGa^9Y#QMGH*Hm{6dUK|gJUao?g{JD<90MvwAp=`_bOmP$BNr+Z1hqqO$NSq$Egv(*}Q@(h2(|APzPkQ1yb;euT5 z1gdlS5Pjs29phzZh{2Sn%V*h_%BfKesa@~!OLm3Y zUCN82$s;&}Ln%LmYy_6ZpjHSujk{|em#aRFj_H6|E2Zm?Y|3fTmvzFy&5G{UFN|(LT8Qzdl9K_Z2klaK;A4I>_|4%Y88#o#mC* zGA$b(%8a_RPD-Vp3;1~?{!??_`=&p2({R*0MpO5zGyB$MZT)74-Gwd8#{&wRyX-rDyxW#q_^h5geBIYsmCJ&(cWgkKBpqWB;-T+-er+gvp&%nu@^mS0!3 z>Z@MMPv4fNILnn^{3e&{@OtIGN*B0lg8AT;CQ}`b^~hUL7tBt*Q+OZv$!_y|qsP`K zZA()E!5Qq2v_wi?LY|!(VmvZDVU=!o&hO7d)&^z-8-*A-8*dOmDhuZQsWR*D$}Tu4 zCCvyU3Zw|NGdLVMMlLzXqi!H9xI$K1X)Z0rXow8_LRKHWRlc4ZRLmT%fuK#UxE3fOgX?a6g^;uQufy1NI+C?E!Q>ZxC#QnD8ibnSDUD2EAnrbjRou)IVa-_bysF&o zu>5g_iHJv{7!idQ!~{9JRdbk68G~mG3H(BBr_Bn+tDIr+5kMHTnbSD|^TsI|x?#XF z9-rUrU+A&Rce65-gLh96?uKZMeEdtBX{WH?J1J6Zem7@8mTe3RfKirWw--9iwf)on zmDG%()6AkKucTu!A4MM)|9Ps-i`E?$19C3Kl~8zTV*!}YBtoF4S+;jLzKaPls}{Mf z%!o=L*#pcz@!R3TliwUzBVu~aEUoJOUO^4&SAkfwH%wkv`T^6ly+$4}&c!**48KwG z)K2}bzG33Lnjak8!mS-3_Dl93kq|LcNrZN7 z5ma-l%abQ19ox4Wf>z4$H(j<5x1;W_zJm_F+FH|f{!NhUEB{&b-U3vN$8rAw*<9zq z+Wh_<#w=Z@qroQKH}WS`gvZ7O^?;!BpE{RLega>}sQXRVbLH$(oDcc=Rn^81PA1Ox z0)6NPl1VftP%U-8=a^0(w(e|K=x5(5+8;B$*v>&s6bt+gO@jkYjuD@{co}4D1uS8y z6a~Z;c2$r*IU#jLV(OGNYhB`!;YfL+wrWpi@Jq*zQB6l{hIys`qyjzOXPi&NRu962 z`q6@-DG9ID$^Z+(23rHIO4`wSzvc)q1=nwsRnVX0pSm|A+8;7!253k&n?he2xSCNv z<)AVouHMhLC^`)X6`JCV-u2o&-`vgC$y{3rpnryZ8k^F9r;*WOD8MpfmdwZq5`m>L z63ZygTk%Z@Cv=+(TiMCXOut!brayl8)^91eM*zpQ>5t(fo>s%;E;BVGUv1hC5Mb97 zWbLgN^-_yhxr?SW3@Ng4Jd6RKQ(Ss*~}U=Zz{Y@3JF3fA~!*c9`)8fi>8}=M_tP zm=G3j@3OfG6wGM0tS`HpSKD=Dgm;Z{EhKN=g<2v|YzBLKK2D+*zQoVxDAYJSPv)%? z&tUVmUM7p&yjGd2_4WlMJEQ>k(iYgyN88Dwjs|6tc z@Oc#zS5~UEfqgU7$q$ymo}c=&n-TNWe|OY%y+xk?JXAxJ9R!EYRIVDjh_GSRGIVbW zkZqrgDsifoBPHAi##(y)Wn#F0}Q-*gjo%q2|6xfPAeW&STcUHMr-=~6xq zP%XN&5b?U6&KB>-n2M#Sp2A;+Da8+FG40alD;=SBe^|8mTj-CiS^|c22*e=wgdU&f zfG6J=_q%5zHC(9*8|tlBFZ>}Ayrbjajm+O5L>(lpcTX8CJTe$^j*yjcCrIxU7YEZA z@TLs@{1?et=HS0YkG>Y zlB>sVLd@)~rF7l%rXxNUlP}8UIPl@+++p>*I_L(D|8?FOrSbaf%a2e9@Rsy-pG~TF z>_^?zq2c{D8&3Vzo#FlF(KZ|bQv=6I@}d;`Y0d*&`@o`Z3HnfM`IS-QE8>V});E!U zm%Q`p@abI^;ENX{Yh1!U4j%gLuU*F7mDpiI_6@xpe!Vb+4&A6)v(!CeBt;n9Uxy)~ zJV9hn3_2&qe%bpoyfX}v-UWX=K0S59wCL>_VkDsKClu5|YB-d7daj3v{@sp#Krp6$ z6uf$d-y#7>cZS5G*t6&~u3drmn#S*2a@I8sapV-ND(&s0Br^cx9#@OeqVAS#zqBOxm zpORSC_Mbv&b!OPw@Fr@00kCpU?2e7=THA)=pNlo0mDiBOd@3_=1uM0h$xvfBqJ|yC zvk!hP-RsZ*JX>M<)f9ZlLZb90cty}(pHm-EI5P@EzGX(3h*}kYf)Eitv;WU>(rA@> zZ)zXRgV;m>fWuKF1U4W~eF^q@r!e-yC&FS!t1X&6b~;KRrCz+?DKMEeZlpr*6WSgm z_8<)H(Re~K@Un+?Yj*#K!?be#!=&0e_%u>kSB}dki2nkeiuTkroVls>ZUPD7x!_PYqrMJ z&WU^pAk8w5AGoOQwYA05=D3S99!;1t?COz}3p^q_C36tQ&!OB}aI*qGe9POZvY}EI}lr%$SnTI!bFVK3>I*o4=ny+xWK5_G?ZFm|#cEn^hho zjO!~u&$vbL3qP6or`3rsC}?s?mA_tZ<@OAN&#r1=-4-Z_W)c-h#G8RDa$s&gEvTY} z*6HEtyIMeTE71>d#d_(%6nrVh(S zyKPB*2inXN*$IUp`+wWHyDaN>k`31w{kncwU3!?)680U)W4M<=fD`9^59rlKG%mA_;rQ}TW z(4e1+I$+(t9{MJ|7>%xb)*~ZCl{AU&p62vx9m|1b4ot?hm%bC0j)ST7IQ~a1c*BZW z>nmS3%dPcjt6CHH8mY~|4}SXPIIM3G2N;}3mcs{ECGQ(1K#ee6KG%*%`#qog6b{uL z4X~7?&<}fhfV#@-R!o@^{;@hXL@}|SpV;}0qrz0M(oyrf<=g-7aAZAEmZ_W zLL@=}0Dubkw?XmCg|x!|daHlI@E8BJ=7@UDf7=xrW=u%)uduj`_sr~X_FvHcSM`^h yBRrrD#ysGEHQ3pMe%OCwn18|g|1{{@|0zaoLWcH#V*x5{LWF4fzX?AR%KriCX>!c~ diff --git a/src/scripts/dep/llm_bot_dep/loaders/pdf.py b/src/scripts/dep/llm_bot_dep/loaders/pdf.py index 89c5878f..a7eb9863 100644 --- a/src/scripts/dep/llm_bot_dep/loaders/pdf.py +++ b/src/scripts/dep/llm_bot_dep/loaders/pdf.py @@ -11,7 +11,7 @@ from langchain.document_loaders import PDFMinerPDFasHTMLLoader from langchain.document_loaders.pdf import BasePDFLoader -from langchain.text_splitter import MarkdownHeaderTextSplitter +# from langchain.text_splitter import MarkdownHeaderTextSplitter logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/src/scripts/glue-job-script.py b/src/scripts/glue-job-script.py index e65a5069..122eb846 100644 --- a/src/scripts/glue-job-script.py +++ b/src/scripts/glue-job-script.py @@ -1,30 +1,24 @@ import os import boto3 import sys -import re import logging -import json import itertools -import uuid -from datetime import datetime from typing import Generator, Any, Dict, Iterable, List, Optional, Tuple -from bs4 import BeautifulSoup import nltk from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.document_loaders import PDFMinerPDFasHTMLLoader, CSVLoader from langchain.docstore.document import Document from langchain.vectorstores import OpenSearchVectorSearch from opensearchpy import RequestsHttpConnection from awsglue.utils import getResolvedOptions from llm_bot_dep import sm_utils -from llm_bot_dep.splitter_utils import MarkdownHeaderTextSplitter from llm_bot_dep.loaders.auto import cb_process_object from llm_bot_dep.enhance_utils import EnhanceWithBedrock from requests_aws4auth import AWS4Auth +from tenacity import retry, stop_after_attempt logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -91,7 +85,6 @@ def batch_generator(generator, batch_size: int): yield batch def aos_injection(content: List[Document], embeddingModelEndpoint: str, aosEndpoint: str, index_name: str, chunk_size: int = 500, gen_chunk: bool = True) -> List[Document]: - """ This function includes the following steps: 1. split the document into chunks with chunk size to fit the embedding model, note the document is already splited by title/subtitle to form sementic chunks approximately; @@ -109,17 +102,6 @@ def aos_injection(content: List[Document], embeddingModelEndpoint: str, aosEndpo Note: """ embeddings = sm_utils.create_sagemaker_embeddings_from_js_model(embeddingModelEndpoint, region) - # TODO, parse the metadata to embed with different index - docsearch = OpenSearchVectorSearch( - index_name=index_name, - embedding_function=embeddings, - opensearch_url="https://{}".format(aosEndpoint), - http_auth = awsauth, - use_ssl = True, - verify_certs = True, - connection_class = RequestsHttpConnection - ) - def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overlap: int = 30) -> Generator[Document, None, None]: text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) for document in content: @@ -140,8 +122,23 @@ def chunk_generator(content: List[Document], chunk_size: int = 500, chunk_overla continue # the batch are still list of Document objects, we need to iterate the list to inject the embeddings, the chunk size (500) should already be small enough to fit the embedding model for document in batch: - logger.info("Adding documents %s to OpenSearch with index %s", document, index_name) - docsearch.add_documents(documents=document) + @retry(stop=stop_after_attempt(3)) + def _aos_injection(document: Document) -> Document: + # TODO, parse the metadata to embed with different index + docsearch = OpenSearchVectorSearch( + index_name=index_name, + embedding_function=embeddings, + opensearch_url="https://{}".format(aosEndpoint), + http_auth = awsauth, + use_ssl = True, + verify_certs = True, + connection_class = RequestsHttpConnection + ) + logger.info("Adding documents %s to OpenSearch with index %s", document, index_name) + docsearch.add_documents(documents=[document]) + logger.info("Retry statistics: %s", _aos_injection.retry.statistics) + # logger.info("Adding documents %s to OpenSearch with index %s", document, index_name) + _aos_injection(document) # main function to be called by Glue job script def main(): @@ -166,6 +163,8 @@ def main(): # prompt is not used in this case prompt = "" solution_title = "GCR Solution LLM Bot" + # Make sure the document is Document object + logger.info("Enhancing document type: {} and content: {}".format(type(document), document)) ewb = EnhanceWithBedrock(prompt, solution_title, document) # This is should be optional for the user to choose the chunk size document_list = ewb.SplitDocumentByTokenNum(document, ENHANCE_CHUNK_SIZE)