From 41ac0ba0eb2289dda42defcbe06218033a467934 Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Wed, 15 Nov 2023 19:19:37 -0800 Subject: [PATCH 1/3] Update sleep time for substack loader and version bump --- embedchain/loaders/substack.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py index 92cdb1e8a7..4dcc609eeb 100644 --- a/embedchain/loaders/substack.py +++ b/embedchain/loaders/substack.py @@ -81,6 +81,6 @@ def load_link(link: str): if data: output.append({"content": data, "meta_data": {"url": link}}) # TODO: allow users to configure this - time.sleep(0.4) # added to avoid rate limiting + time.sleep(1.0) # added to avoid rate limiting return {"doc_id": doc_id, "data": output} diff --git a/pyproject.toml b/pyproject.toml index f285ef4444..80380a5fa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.14" +version = "0.1.15" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" authors = [ "Taranjeet Singh ", From 26001a07078ca074891eb415c9cc67409fec4335 Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Thu, 16 Nov 2023 15:20:51 -0800 Subject: [PATCH 2/3] [Loaders] Improve web page and sitemap loader usability --- embedchain/loaders/sitemap.py | 3 ++- embedchain/loaders/web_page.py | 48 ++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/embedchain/loaders/sitemap.py b/embedchain/loaders/sitemap.py index fa8bbe50ab..707c891dc4 100644 --- a/embedchain/loaders/sitemap.py +++ b/embedchain/loaders/sitemap.py @@ -3,6 +3,7 @@ import logging import requests +from tqdm import tqdm try: from bs4 import BeautifulSoup @@ -52,7 +53,7 @@ def load_link(link): with concurrent.futures.ThreadPoolExecutor() as executor: future_to_link = {executor.submit(load_link, link): link for link in links} - for future in concurrent.futures.as_completed(future_to_link): + for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)): link = future_to_link[future] try: data = future.result() diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py index 98109d0716..bd2575f7b2 100644 --- a/embedchain/loaders/web_page.py +++ b/embedchain/loaders/web_page.py @@ -17,26 +17,32 @@ @register_deserializable class WebPageLoader(BaseLoader): + # Shared session for all instances + _session = requests.Session() + def load_data(self, url): - """Load data from a web page.""" - response = requests.get(url) - data = response.content - content = self._get_clean_content(data, url) - - meta_data = { - "url": url, - } - - doc_id = hashlib.sha256((content + url).encode()).hexdigest() - return { - "doc_id": doc_id, - "data": [ - { - "content": content, - "meta_data": meta_data, - } - ], - } + """Load data from a web page using a shared requests session.""" + try: + response = self._session.get(url, timeout=30) + response.raise_for_status() + data = response.content + content = self._get_clean_content(data, url) + + meta_data = {"url": url} + + doc_id = hashlib.sha256((content + url).encode()).hexdigest() + return { + "doc_id": doc_id, + "data": [ + { + "content": content, + "meta_data": meta_data, + } + ], + } + except requests.RequestException as e: + logging.error(f"Error fetching URL {url}: {e}") + return None def _get_clean_content(self, html, url) -> str: soup = BeautifulSoup(html, "html.parser") @@ -86,3 +92,7 @@ def _get_clean_content(self, html, url) -> str: ) return content + + @classmethod + def close_session(cls): + cls._session.close() From 6e46bdc1813c5515a56e2888b7cfff57c10f5b1f Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Thu, 16 Nov 2023 15:42:10 -0800 Subject: [PATCH 3/3] Fix test --- embedchain/loaders/web_page.py | 38 +++++++++++++++------------------- tests/loaders/test_web_page.py | 2 +- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py index bd2575f7b2..931031826f 100644 --- a/embedchain/loaders/web_page.py +++ b/embedchain/loaders/web_page.py @@ -22,27 +22,23 @@ class WebPageLoader(BaseLoader): def load_data(self, url): """Load data from a web page using a shared requests session.""" - try: - response = self._session.get(url, timeout=30) - response.raise_for_status() - data = response.content - content = self._get_clean_content(data, url) - - meta_data = {"url": url} - - doc_id = hashlib.sha256((content + url).encode()).hexdigest() - return { - "doc_id": doc_id, - "data": [ - { - "content": content, - "meta_data": meta_data, - } - ], - } - except requests.RequestException as e: - logging.error(f"Error fetching URL {url}: {e}") - return None + response = self._session.get(url, timeout=30) + response.raise_for_status() + data = response.content + content = self._get_clean_content(data, url) + + meta_data = {"url": url} + + doc_id = hashlib.sha256((content + url).encode()).hexdigest() + return { + "doc_id": doc_id, + "data": [ + { + "content": content, + "meta_data": meta_data, + } + ], + } def _get_clean_content(self, html, url) -> str: soup = BeautifulSoup(html, "html.parser") diff --git a/tests/loaders/test_web_page.py b/tests/loaders/test_web_page.py index cdaf09447c..3134d4d0c1 100644 --- a/tests/loaders/test_web_page.py +++ b/tests/loaders/test_web_page.py @@ -27,7 +27,7 @@ def test_load_data(web_page_loader): """ - with patch("embedchain.loaders.web_page.requests.get", return_value=mock_response): + with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response): result = web_page_loader.load_data(page_url) content = web_page_loader._get_clean_content(mock_response.content, page_url)