-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Enable URL as context
* Formatting update
- Loading branch information
1 parent
6e1325c
commit 469b50d
Showing 6 changed files with 130 additions and 56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
""" Document Parser """ | ||
from bs4 import BeautifulSoup | ||
from langchain.retrievers.multi_query import MultiQueryRetriever | ||
from langchain_community.document_loaders.chromium import AsyncChromiumLoader | ||
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings | ||
from langchain_community.vectorstores.chroma import Chroma | ||
|
||
class DocReader:
    """Load a web page and expose its contents through a retriever.

    Attributes:
        docs_retriever: ``MultiQueryRetriever`` built by :meth:`create_db`;
            ``None`` until that method has been called.
        t_docs: Initialised to ``None``; never assigned inside this class.
        embed_model: ``HuggingFaceEmbeddings`` instance used to embed
            documents when the vector store is created.
        url: The URL this reader was constructed for.
        loader: ``AsyncChromiumLoader`` bound to ``url``.
        model: The LLM handed to ``MultiQueryRetriever.from_llm``.
    """

    docs_retriever = None
    t_docs = None

    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
        """Set up the embedding model, the page loader and the LLM.

        Args:
            main_model: LLM passed to ``MultiQueryRetriever.from_llm`` in
                :meth:`create_db`.
            st_model: Model name given to ``HuggingFaceEmbeddings``.
            url: Page address the default loader is bound to.
        """
        self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
        self.url = url
        self.loader = AsyncChromiumLoader([url])
        self.model = main_model

    def create_db(self, docs):
        """Create a vector database from *docs* and build the retriever.

        The resulting ``MultiQueryRetriever`` is stored on
        ``self.docs_retriever``; nothing is returned.

        Args:
            docs: Documents passed to ``Chroma.from_documents``.
        """
        vector_store = Chroma.from_documents(
            documents=docs,
            embedding=self.embed_model,
        )

        self.docs_retriever = MultiQueryRetriever.from_llm(
            llm=self.model,
            retriever=vector_store.as_retriever(),
        )

    # Experimental
    def extract_links(self, html):
        """Return the ``href`` value of every anchor in *html* that has one.

        Args:
            html: HTML markup to parse (a string, as accepted by
                ``BeautifulSoup``).

        Returns:
            A list of ``href`` attribute values in document order.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # ``href=True`` skips anchors without an href (e.g. named anchors),
        # which would otherwise show up as ``None`` entries in the result.
        return [anchor['href'] for anchor in soup.find_all('a', href=True)]

    async def crawl_and_load(self, url, visited=None):
        """Load the contents of *url* and record it as visited.

        Args:
            url: Address of the page to load.
            visited: Optional set of already-seen URLs; *url* is added to
                it in place. A fresh set is used when omitted.

        Returns:
            Whatever the loader's ``aload()`` coroutine yields for *url*.
        """
        if visited is None:
            visited = set()
        visited.add(url)

        # Reuse the loader built in __init__ when it already targets this
        # URL; otherwise build one for the requested address. Previously the
        # ``url`` argument was ignored and ``self.url`` was always loaded.
        loader = self.loader if url == self.url else AsyncChromiumLoader([url])

        # TODO: link following is not implemented yet. When it is, pass each
        # document's ``page_content`` (a string) to ``extract_links`` rather
        # than the list of documents.
        return await loader.aload()
This file was deleted.
Oops, something went wrong.