From 085957e56c75133c5705199f38195b6dd6c1c09b Mon Sep 17 00:00:00 2001
From: Viraj Malia
Date: Mon, 26 Aug 2024 16:56:44 -0700
Subject: [PATCH 1/2] Perform cleanup of webpage elements in vectordb

---
 app/server.py                |  12 ++--
 app/src/parsers/documents.py | 130 ++++++++++++++++++++++++++---------
 2 files changed, 103 insertions(+), 39 deletions(-)

diff --git a/app/server.py b/app/server.py
index 0f59471..71e66b9 100644
--- a/app/server.py
+++ b/app/server.py
@@ -6,7 +6,6 @@
 from langchain.pydantic_v1 import Field
 from langchain_core.runnables import RunnableLambda
 from langchain_core.messages import HumanMessage, AIMessage
-from langchain_community.document_transformers.html2text import Html2TextTransformer
 from langserve import CustomUserType
 from langserve.server import add_routes
 from app.src.llama4u import Llama4U
@@ -35,14 +34,12 @@ class ChatHistory(CustomUserType):
 async def get_response_from_docs(url):
     """ Get a response from a URL page content """
-    doc_reader = DocReader(llama4u.llm, url=url)
-    crawled_data = await doc_reader.crawl_and_load(url)
+    doc_reader = DocReader(llama4u.llm, base_url=url)
+    docs = await doc_reader.process_documentation()

-    h2t = Html2TextTransformer()
-    t_docs = h2t.transform_documents(crawled_data)
-    doc_reader.create_db(t_docs)
+    doc_reader.create_db(docs)
     return doc_reader.docs_retriever.invoke( # type: ignore
-        input='Read each word carefully and \
+        input='Read carefully and \
         find the relevance with programming and APIs. \
         Summarize the document such that it can be used \
         as a context for future conversations.')
@@ -88,7 +85,6 @@ async def format_input(input_data: ChatHistory, config):
     app,
     chat_model.with_types(
         input_type=ChatHistory,
-        output_type=ChatHistory,
     ).with_config(
         configurable={"doc_url": "doc_url"}),
     config_keys=["configurable", "doc_url"],
diff --git a/app/src/parsers/documents.py b/app/src/parsers/documents.py
index 59b10e6..a3e3126 100644
--- a/app/src/parsers/documents.py
+++ b/app/src/parsers/documents.py
@@ -1,48 +1,116 @@
-""" Document Parser """
+import os
+import asyncio
+import uuid
+import logging
+from urllib.parse import urljoin
 from bs4 import BeautifulSoup
-from langchain.retrievers.multi_query import MultiQueryRetriever
-from langchain_community.document_loaders.chromium import AsyncChromiumLoader
 from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores.chroma import Chroma
+from langchain.schema import Document
+from playwright.async_api import async_playwright

-class DocReader():
-    """ Document Reader """
-    docs_retriever = None
-    t_docs = None
+logging.basicConfig(level=logging.INFO)

-    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
+class DocReader:
+    """ Document creator and reader class """
+    def __init__(self, main_model, st_model='sentence-transformers/all-mpnet-base-v2', base_url=''):
         self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
-        self.url = url
-        self.loader = AsyncChromiumLoader([url])
+        self.base_url = base_url
         self.model = main_model
+        self.visited = set()
+        self.docs_retriever = None
+        # Disable telemetry for ChromaDB
+        os.environ["ANONYMIZED_TELEMETRY"] = "False"

     def create_db(self, docs):
-        """ Create a verctor database from docs """
-        vector_store = Chroma.from_documents(
-            documents=docs,
+        """ Create vector database and retriever """
+        if not docs:
+            logging.warning("No documents to create database from.")
+            return
+
+        texts = [doc.page_content for doc in docs]
+        metadatas = [doc.metadata for doc in docs]
+        ids = [str(uuid.uuid4()) for _ in docs]
+
+        logging.info("Creating database with %d documents.", len(docs))
+
+        vector_store = Chroma.from_texts(
+            texts=texts,
+            metadatas=metadatas,
+            ids=ids,
             embedding=self.embed_model,
-            )
+        )
+
+        self.docs_retriever = vector_store.as_retriever()

-        self.docs_retriever = MultiQueryRetriever.from_llm(
-            llm=self.model,
-            retriever=vector_store.as_retriever(),
-            )
+    def extract_content(self, html):
+        """ Extract content without webpage elements """
+        soup = BeautifulSoup(html, 'html.parser')
+        # Remove script and style elements
+        for script in soup(['script', 'style']):
+            script.decompose()
+        # Extract text content
+        return soup.get_text(separator=' ', strip=True)

-    # Experimental
-    def extract_links(self, html):
-        """ Extract links from this webpage """
+    def extract_links(self, html, url):
+        """ Extract links from the webpage """
         soup = BeautifulSoup(html, 'html.parser')
-        links = [link.get('href') for link in soup.find_all('a')]
+        links = []
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            full_url = urljoin(url, href)
+            if full_url.startswith(self.base_url) and full_url not in self.visited:
+                links.append(full_url)
         return links

-    async def crawl_and_load(self, url, visited=None):
-        """ Crawl and load contents of the URL """
-        if visited is None:
-            visited = set()
+    async def crawl_and_load(self, url, page):
+        """ Web Crawler """
+        if url in self.visited:
+            return []
+
+        self.visited.add(url)
+
+        try:
+            await page.goto(url, wait_until="networkidle")
+            content = await page.content()
+        except Exception as e:
+            logging.error("Error loading %s: %s", url, str(e))
+            return []
+
+        text_content = self.extract_content(content)
+        links = self.extract_links(content, url)
+
+        doc = Document(page_content=text_content, metadata={"source": url})
+
+        logging.info("Crawled %s, found %d links", url, len(links))
+
+        tasks = [self.crawl_and_load(link, page) for link in links]
+        child_docs = await asyncio.gather(*tasks)
+
+        return [doc] + [d for sublist in child_docs for d in sublist]
+
+    async def process_documentation(self):
+        """ Entrypoint for reading online documentation """
+        async with async_playwright() as p:
+            browser = await p.chromium.launch()
+            page = await browser.new_page()
+
+            all_docs = await self.crawl_and_load(self.base_url, page)
+
+            await browser.close()
+
+        logging.info("Total documents crawled: %d", len(all_docs))
+        if all_docs:
+            self.create_db(all_docs)
+        else:
+            logging.warning("No documents were crawled.")
+        return all_docs

-        visited.add(url)
-        html_docs = await self.loader.aload()
-        #content = html_docs[0].page_content
-        #links = self.extract_links(html_docs)
+    async def query(self, query: str, num_results: int = 3):
+        """ Query vector database and retrieve results """
+        if not self.docs_retriever:
+            raise ValueError("Document retriever has not been initialized.\
+                Call process_documentation first.")

-        return html_docs
+        relevant_docs = self.docs_retriever.invoke(input=query)
+        return relevant_docs[:num_results]

From 01a71d8012817a2a49bd66bc4cff640468fbe054 Mon Sep 17 00:00:00 2001
From: Viraj Malia
Date: Mon, 26 Aug 2024 17:16:00 -0700
Subject: [PATCH 2/2] remove unwanted deps

---
 app/pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/app/pyproject.toml b/app/pyproject.toml
index 0d0c52d..3e4facd 100644
--- a/app/pyproject.toml
+++ b/app/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
     "lark",
     "termcolor",
     "playwright",
-    "html2text",
 ]

 [project.scripts]
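
Note for reviewers: a minimal usage sketch of the reworked DocReader flow from PATCH 1/2 (illustrative only, not part of the patches; the docs URL, the None placeholder for main_model, and the sample query are assumptions):

    import asyncio

    from app.src.parsers.documents import DocReader

    async def main():
        # main_model is stored but no longer used by the retriever path,
        # so a placeholder is fine for this sketch; server.py passes llama4u.llm.
        reader = DocReader(main_model=None, base_url="https://docs.example.com/")

        # Crawl pages under base_url with Playwright, strip <script>/<style>
        # elements, and index the cleaned text in a Chroma vector store.
        docs = await reader.process_documentation()
        print(f"Indexed {len(docs)} pages")

        # Query the vector store through the plain retriever.
        for doc in await reader.query("authentication", num_results=3):
            print(doc.metadata["source"])

    if __name__ == "__main__":
        asyncio.run(main())

In the app itself, server.py passes llama4u.llm as main_model and drives this flow through get_response_from_docs.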