From c93e49d2b82b20298507e017ebe4e5c834e2f2f2 Mon Sep 17 00:00:00 2001
From: Deshraj Yadav <deshrajdry@gmail.com>
Date: Wed, 15 Nov 2023 19:35:30 -0800
Subject: [PATCH 1/2] [Bug fix] Update sleep time for substack loader and
 version bump (#958)

---
 embedchain/loaders/substack.py | 2 +-
 pyproject.toml                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/embedchain/loaders/substack.py b/embedchain/loaders/substack.py
index 92cdb1e8a7..4dcc609eeb 100644
--- a/embedchain/loaders/substack.py
+++ b/embedchain/loaders/substack.py
@@ -81,6 +81,6 @@ def load_link(link: str):
             if data:
                 output.append({"content": data, "meta_data": {"url": link}})
             # TODO: allow users to configure this
-            time.sleep(0.4)  # added to avoid rate limiting
+            time.sleep(1.0)  # added to avoid rate limiting
 
         return {"doc_id": doc_id, "data": output}
diff --git a/pyproject.toml b/pyproject.toml
index f285ef4444..80380a5fa3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.14"
+version = "0.1.15"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",

From 28460f725cd43a0e239d59123849e8b21c839602 Mon Sep 17 00:00:00 2001
From: Deven Patel <iamdevenpatel@gmail.com>
Date: Thu, 16 Nov 2023 13:30:38 -0800
Subject: [PATCH 2/2] [Bugfix] fix poetry lock (#960)

---
 embedchain/loaders/github.py            |  2 +-
 embedchain/loaders/unstructured_file.py | 13 +++++-----
 poetry.lock                             | 32 ++++++++++++-------------
 pyproject.toml                          | 10 ++++----
 4 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/embedchain/loaders/github.py b/embedchain/loaders/github.py
index 6b48f0d1ee..a1d990d609 100644
--- a/embedchain/loaders/github.py
+++ b/embedchain/loaders/github.py
@@ -68,7 +68,7 @@ def _add_repo_files(repo_path: str):
                             data.extend(results)
                             data_urls.extend([result.get("meta_data").get("url") for result in results])
                     except Exception as e:
-                        logging.error(f"Failed to process {file}: {e}")
+                        logging.warning(f"Failed to process {file}: {e}")
 
         source_hash = hashlib.sha256(repo_url.encode()).hexdigest()
         repo_path = f"/tmp/{source_hash}"
diff --git a/embedchain/loaders/unstructured_file.py b/embedchain/loaders/unstructured_file.py
index 9b491c4a6c..be8cd931fc 100644
--- a/embedchain/loaders/unstructured_file.py
+++ b/embedchain/loaders/unstructured_file.py
@@ -1,11 +1,5 @@
 import hashlib
 
-try:
-    from langchain.document_loaders import UnstructuredFileLoader
-except ImportError:
-    raise ImportError(
-        'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
-    ) from None
 from embedchain.helper.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.utils import clean_string
@@ -15,6 +9,13 @@
 class UnstructuredLoader(BaseLoader):
     def load_data(self, url):
         """Load data from a Unstructured file."""
+        try:
+            from langchain.document_loaders import UnstructuredFileLoader
+        except ImportError:
+            raise ImportError(
+                'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'  # noqa: E501
+            ) from None
+
         loader = UnstructuredFileLoader(url)
         data = []
         all_content = []
diff --git a/poetry.lock b/poetry.lock
index dd8b38dab0..2f2f26d0ce 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6892,30 +6892,30 @@ backoff = "*"
 beautifulsoup4 = "*"
 chardet = "*"
 dataclasses-json = "*"
-ebooklib = {version = "*", optional = true, markers = "extra == \"local-inference\""}
+ebooklib = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
 emoji = "*"
 filetype = "*"
 langdetect = "*"
 lxml = "*"
-markdown = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-msg-parser = {version = "*", optional = true, markers = "extra == \"local-inference\""}
+markdown = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+msg-parser = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
 nltk = "*"
 numpy = "*"
-openpyxl = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-pandas = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-pdf2image = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-"pdfminer.six" = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-pypandoc = {version = "*", optional = true, markers = "extra == \"local-inference\""}
-python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"local-inference\""}
+openpyxl = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+pandas = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+pdf2image = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+"pdfminer.six" = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+pypandoc = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
 python-iso639 = "*"
 python-magic = "*"
-python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"local-inference\""}
+python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
 rapidfuzz = "*"
 requests = "*"
 tabulate = "*"
-unstructured-inference = {version = "0.7.3", optional = true, markers = "extra == \"local-inference\""}
-"unstructured.pytesseract" = {version = ">=0.3.12", optional = true, markers = "extra == \"local-inference\""}
-xlrd = {version = "*", optional = true, markers = "extra == \"local-inference\""}
+unstructured-inference = {version = "0.7.3", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+"unstructured.pytesseract" = {version = ">=0.3.12", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
+xlrd = {version = "*", optional = true, markers = "extra == \"all-docs\" or extra == \"local-inference\""}
 
 [package.extras]
 airtable = ["pyairtable"]
@@ -7641,7 +7641,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [extras]
 cohere = ["cohere"]
 community = ["llama-hub"]
-dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured"]
+dataloaders = ["beautifulsoup4", "docx2txt", "duckduckgo-search", "pypdf", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"]
 discord = ["discord"]
 elasticsearch = ["elasticsearch"]
 git = ["gitpython"]
@@ -7663,9 +7663,9 @@ streamlit = []
 vertexai = ["google-cloud-aiplatform"]
 weaviate = ["weaviate-client"]
 whatsapp = ["flask", "twilio"]
-youtube-channel = ["yt_dlp"]
+youtube = ["youtube-transcript-api", "yt_dlp"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "a7282080c7a4379bdc6f33dfe9cae7eb20764aae0176137ba5c7af7cdcc58ede"
+content-hash = "58dce9bc5ef9c8d7e77d5fbc3176e24a8facbe89beddbf4c605b9c68e6617b5a"
diff --git a/pyproject.toml b/pyproject.toml
index 80380a5fa3..556800f3a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.15"
+version = "0.1.16"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",
@@ -120,7 +120,7 @@ weaviate-client = { version = "^3.24.1", optional = true }
 docx2txt = { version = "^0.8", optional = true }
 pinecone-client = { version = "^2.2.4", optional = true }
 qdrant-client = { version = "1.6.3", optional = true }
-unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true}
+unstructured = {extras = ["local-inference", "all-docs"], version = "^0.10.18", optional = true}
 pillow = { version = "10.0.1", optional = true }
 torchvision = { version = ">=0.15.1, !=0.15.2", optional = true }
 ftfy = { version = "6.1.1", optional = true }
@@ -169,7 +169,7 @@ huggingface_hub=["huggingface_hub"]
 cohere = ["cohere"]
 milvus = ["pymilvus"]
 dataloaders=[
-    "youtube-transcripts-api",
+    "youtube-transcript-api",
     "beautifulsoup4",
     "docx2txt",
     "duckduckgo-search",
@@ -193,9 +193,9 @@ json = ["llama-hub"]
 postgres = ["psycopg", "psycopg-binary", "psycopg-pool"]
 mysql = ["mysql-connector-python"]
 git = ["gitpython"]
-youtube_channel = [
+youtube = [
     "yt_dlp",
-    "youtube-transcripts-api",
+    "youtube-transcript-api",
 ]
 
 [tool.poetry.group.docs.dependencies]