Skip to content

Commit

Permalink
Merge branch 'main' into user/dyadav/improve-loaders
Browse files (browse the repository at this point in the history)
  • Loading branch information
deshraj authored Nov 16, 2023
2 parents 6e46bdc + 28460f7 commit 68c5cd4
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 28 deletions.
2 changes: 1 addition & 1 deletion embedchain/loaders/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _add_repo_files(repo_path: str):
data.extend(results)
data_urls.extend([result.get("meta_data").get("url") for result in results])
except Exception as e:
logging.error(f"Failed to process {file}: {e}")
logging.warn(f"Failed to process {file}: {e}")

source_hash = hashlib.sha256(repo_url.encode()).hexdigest()
repo_path = f"/tmp/{source_hash}"
Expand Down
13 changes: 7 additions & 6 deletions embedchain/loaders/unstructured_file.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
import hashlib

try:
from langchain.document_loaders import UnstructuredFileLoader
except ImportError:
raise ImportError(
'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
) from None
from embedchain.helper.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import clean_string
Expand All @@ -15,6 +9,13 @@
class UnstructuredLoader(BaseLoader):
def load_data(self, url):
"""Load data from a Unstructured file."""
try:
from langchain.document_loaders import UnstructuredFileLoader
except ImportError:
raise ImportError(
'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' # noqa: E501
) from None

loader = UnstructuredFileLoader(url)
data = []
all_content = []
Expand Down
32 changes: 16 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "embedchain"
version = "0.1.15"
version = "0.1.16"
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
authors = [
"Taranjeet Singh <[email protected]>",
Expand Down Expand Up @@ -120,7 +120,7 @@ weaviate-client = { version = "^3.24.1", optional = true }
docx2txt = { version = "^0.8", optional = true }
pinecone-client = { version = "^2.2.4", optional = true }
qdrant-client = { version = "1.6.3", optional = true }
unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true}
unstructured = {extras = ["local-inference", "all-docs"], version = "^0.10.18", optional = true}
pillow = { version = "10.0.1", optional = true }
torchvision = { version = ">=0.15.1, !=0.15.2", optional = true }
ftfy = { version = "6.1.1", optional = true }
Expand Down Expand Up @@ -169,7 +169,7 @@ huggingface_hub=["huggingface_hub"]
cohere = ["cohere"]
milvus = ["pymilvus"]
dataloaders=[
"youtube-transcripts-api",
"youtube-transcript-api",
"beautifulsoup4",
"docx2txt",
"duckduckgo-search",
Expand All @@ -193,9 +193,9 @@ json = ["llama-hub"]
postgres = ["psycopg", "psycopg-binary", "psycopg-pool"]
mysql = ["mysql-connector-python"]
git = ["gitpython"]
youtube_channel = [
youtube = [
"yt_dlp",
"youtube-transcripts-api",
"youtube-transcript-api",
]

[tool.poetry.group.docs.dependencies]
Expand Down

0 comments on commit 68c5cd4

Please sign in to comment.