From c93e49d2b82b20298507e017ebe4e5c834e2f2f2 Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Wed, 15 Nov 2023 19:35:30 -0800 Subject: [PATCH 1/2] [Bug fix] Update sleep time for substack loader and version bump (#958) --- embedchain/loaders/substack.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py index 92cdb1e8a7..4dcc609eeb 100644 --- a/embedchain/loaders/substack.py +++ b/embedchain/loaders/substack.py @@ -81,6 +81,6 @@ def load_link(link: str): if data: output.append({"content": data, "meta_data": {"url": link}}) # TODO: allow users to configure this - time.sleep(0.4) # added to avoid rate limiting + time.sleep(1.0) # added to avoid rate limiting return {"doc_id": doc_id, "data": output} diff --git a/pyproject.toml b/pyproject.toml index f285ef4444..80380a5fa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.14" +version = "0.1.15" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" authors = [ "Taranjeet Singh ", From 28460f725cd43a0e239d59123849e8b21c839602 Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Thu, 16 Nov 2023 13:30:38 -0800 Subject: [PATCH 2/2] [Bugfix] fix poetry lock (#960) --- embedchain/loaders/github.py | 2 +- embedchain/loaders/unstructured_file.py | 13 +++++----- poetry.lock | 32 ++++++++++++------------- pyproject.toml | 10 ++++---- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/embedchain/loaders/github.py b/embedchain/loaders/github.py index 6b48f0d1ee..a1d990d609 100644 --- a/embedchain/loaders/github.py +++ b/embedchain/loaders/github.py @@ -68,7 +68,7 @@ def _add_repo_files(repo_path: str): data.extend(results) data_urls.extend([result.get("meta_data").get("url") for result in results]) except Exception as e: - logging.error(f"Failed to process {file}: {e}") + logging.warn(f"Failed to process {file}: {e}") source_hash = hashlib.sha256(repo_url.encode()).hexdigest() repo_path = f"/tmp/{source_hash}" diff --git a/embedchain/loaders/unstructured_file.py b/embedchain/loaders/unstructured_file.py index 9b491c4a6c..be8cd931fc 100644 --- a/embedchain/loaders/unstructured_file.py +++ b/embedchain/loaders/unstructured_file.py @@ -1,11 +1,5 @@ import hashlib -try: - from langchain.document_loaders import UnstructuredFileLoader -except ImportError: - raise ImportError( - 'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' - ) from None from embedchain.helper.json_serializable import register_deserializable from embedchain.loaders.base_loader import BaseLoader from embedchain.utils import clean_string @@ -15,6 +9,13 @@ class UnstructuredLoader(BaseLoader): def load_data(self, url): """Load data from a Unstructured file.""" + try: + from langchain.document_loaders import UnstructuredFileLoader + except ImportError: + raise ImportError( + 'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' # noqa: E501 + ) from None + loader = UnstructuredFileLoader(url) data = [] all_content = [] diff --git a/poetry.lock b/poetry.lock index dd8b38dab0..2f2f26d0ce 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6892,30 +6892,30 @@ backoff = "*" beautifulsoup4 = "*" chardet = "*" dataclasses-json = "*" -ebooklib = {version = "*", optional = true, markers = "extra == \"local-inference\""} +ebooklib = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} emoji = "*" filetype = "*" langdetect = "*" lxml = "*" -markdown = {version = "*", optional = true, markers = "extra == \"local-inference\""} -msg-parser = {version = "*", optional = true, markers = "extra == \"local-inference\""} +markdown = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +msg-parser = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} nltk = "*" numpy = "*" -openpyxl = {version = "*", optional = true, markers = "extra == \"local-inference\""} -pandas = {version = "*", optional = true, markers = "extra == \"local-inference\""} -pdf2image = {version = "*", optional = true, markers = "extra == \"local-inference\""} -"pdfminer.six" = {version = "*", optional = true, markers = "extra == \"local-inference\""} -pypandoc = {version = "*", optional = true, markers = "extra == \"local-inference\""} -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"local-inference\""} +openpyxl = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +pandas = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +pdf2image = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +"pdfminer.six" = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +pypandoc = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"local-inference\""} +python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} rapidfuzz = "*" requests = "*" tabulate = "*" -unstructured-inference = {version = "0.7.3", optional = true, markers = "extra == \"local-inference\""} -"unstructured.pytesseract" = {version = ">=0.3.12", optional = true, markers = "extra == \"local-inference\""} -xlrd = {version = "*", optional = true, markers = "extra == \"local-inference\""} +unstructured-inference = {version = "0.7.3", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +"unstructured.pytesseract" = {version = ">=0.3.12", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} +xlrd = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""} [package.extras] airtable = ["pyairtable"] @@ -7641,7 +7641,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] cohere = ["cohere"] community = ["llama-hub"] -dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured"] +dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"] discord = ["discord"] elasticsearch = ["elasticsearch"] git = ["gitpython"] @@ -7663,9 +7663,9 @@ streamlit = [] vertexai = ["google-cloud-aiplatform"] weaviate = ["weaviate-client"] whatsapp = ["flask", "twilio"] -youtube-channel = ["yt_dlp"] +youtube = ["youtube-transcript-api", "yt_dlp"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.12" -content-hash = "a7282080c7a4379bdc6f33dfe9cae7eb20764aae0176137ba5c7af7cdcc58ede" +content-hash = "58dce9bc5ef9c8d7e77d5fbc3176e24a8facbe89beddbf4c605b9c68e6617b5a" diff --git a/pyproject.toml b/pyproject.toml index 80380a5fa3..556800f3a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.15" +version = "0.1.16" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" authors = [ "Taranjeet Singh ", @@ -120,7 +120,7 @@ weaviate-client = { version = "^3.24.1", optional = true } docx2txt = { version = "^0.8", optional = true } pinecone-client = { version = "^2.2.4", optional = true } qdrant-client = { version = "1.6.3", optional = true } -unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true} +unstructured = {extras = ["local-inference", "all-docs"], version = "^0.10.18", optional = true} pillow = { version = "10.0.1", optional = true } torchvision = { version = ">=0.15.1, !=0.15.2", optional = true } ftfy = { version = "6.1.1", optional = true } @@ -169,7 +169,7 @@ huggingface_hub=["huggingface_hub"] cohere = ["cohere"] milvus = ["pymilvus"] dataloaders=[ - "youtube-transcripts-api", + "youtube-transcript-api", "beautifulsoup4", "docx2txt", "duckduckgo-search", @@ -193,9 +193,9 @@ json = ["llama-hub"] postgres = ["psycopg", "psycopg-binary", "psycopg-pool"] mysql = ["mysql-connector-python"] git = ["gitpython"] -youtube_channel = [ +youtube = [ "yt_dlp", - "youtube-transcripts-api", + "youtube-transcript-api", ] [tool.poetry.group.docs.dependencies]