Perform scraping of webpages without web elements in vectordb #21

Merged · 2 commits · Aug 27, 2024
1 change: 0 additions & 1 deletion app/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
"lark",
"termcolor",
"playwright",
"html2text",
]

[project.scripts]
12 changes: 4 additions & 8 deletions app/server.py
@@ -6,7 +6,6 @@
 from langchain.pydantic_v1 import Field
 from langchain_core.runnables import RunnableLambda
 from langchain_core.messages import HumanMessage, AIMessage
-from langchain_community.document_transformers.html2text import Html2TextTransformer
 from langserve import CustomUserType
 from langserve.server import add_routes
 from app.src.llama4u import Llama4U
@@ -35,14 +34,12 @@ class ChatHistory(CustomUserType):

 async def get_response_from_docs(url):
     """ Get a response from a URL page content """
-    doc_reader = DocReader(llama4u.llm, url=url)
-    crawled_data = await doc_reader.crawl_and_load(url)
+    doc_reader = DocReader(llama4u.llm, base_url=url)
+    docs = await doc_reader.process_documentation()

-    h2t = Html2TextTransformer()
-    t_docs = h2t.transform_documents(crawled_data)
-    doc_reader.create_db(t_docs)
+    doc_reader.create_db(docs)
     return doc_reader.docs_retriever.invoke(  # type: ignore
-        input='Read each word carefully and \
+        input='Read carefully and \
            find the relevance with programming and APIs. \
            Summarize the document such that it can be used \
            as a context for future conversations.')
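For context, `docs_retriever.invoke(...)` above returns a list of `Document` objects rather than a single string. A caller that wants one summary context might join the chunks, as in this sketch (the sample documents are stand-ins, not output from this PR):

```python
from langchain.schema import Document

# Stand-ins for the list returned by docs_retriever.invoke(...) above.
retrieved = [
    Document(page_content="POST /v1/chat starts a chat session.", metadata={"source": "https://example.com/docs/api"}),
    Document(page_content="Authentication uses a bearer token.", metadata={"source": "https://example.com/docs/auth"}),
]

# Fold the retrieved chunks into a single context string for later turns.
summary_context = "\n\n".join(doc.page_content for doc in retrieved)
print(summary_context)
```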
@@ -88,7 +85,6 @@ async def format_input(input_data: ChatHistory, config):
     app,
     chat_model.with_types(
         input_type=ChatHistory,
-        output_type=ChatHistory,
     ).with_config(
         configurable={"doc_url": "doc_url"}),
     config_keys=["configurable", "doc_url"],
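The `doc_url` configurable is what a client sets per request. A rough client-side sketch, assuming the route is mounted at `/llama4u` on localhost and that the payload matches the `ChatHistory` schema (neither the mount path nor the payload shape is visible in this diff):

```python
from langserve import RemoteRunnable

# Assumed mount path; the real path is not shown in this diff.
chat = RemoteRunnable("http://localhost:8000/llama4u")

# Placeholder payload; it must match the ChatHistory CustomUserType in server.py.
payload = {"chat_history": []}

response = chat.invoke(
    payload,
    config={"configurable": {"doc_url": "https://example.com/docs"}},
)
print(response)
```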
130 changes: 99 additions & 31 deletions app/src/parsers/documents.py
@@ -1,48 +1,116 @@
""" Document Parser """
import os
import asyncio
import uuid
import logging
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders.chromium import AsyncChromiumLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain.schema import Document
from playwright.async_api import async_playwright

-class DocReader():
-    """ Document Reader """
-    docs_retriever = None
-    t_docs = None
+logging.basicConfig(level=logging.INFO)

-    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
+class DocReader:
+    """ Document creator and reader class """
+    def __init__(self, main_model, st_model='sentence-transformers/all-mpnet-base-v2', base_url=''):
         self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
-        self.url = url
-        self.loader = AsyncChromiumLoader([url])
+        self.base_url = base_url
         self.model = main_model
+        self.visited = set()
+        self.docs_retriever = None
+        # Disable telemetry for ChromaDB
+        os.environ["ANONYMIZED_TELEMETRY"] = "False"

     def create_db(self, docs):
-        """ Create a verctor database from docs """
-        vector_store = Chroma.from_documents(
-            documents=docs,
+        """ Create vector database and retriever """
+        if not docs:
+            logging.warning("No documents to create database from.")
+            return
+
+        texts = [doc.page_content for doc in docs]
+        metadatas = [doc.metadata for doc in docs]
+        ids = [str(uuid.uuid4()) for _ in docs]
+
+        logging.info("Creating database with %d documents.", len(docs))
+
+        vector_store = Chroma.from_texts(
+            texts=texts,
+            metadatas=metadatas,
+            ids=ids,
             embedding=self.embed_model,
-            )
+        )
+
+        self.docs_retriever = vector_store.as_retriever()

-        self.docs_retriever = MultiQueryRetriever.from_llm(
-            llm=self.model,
-            retriever=vector_store.as_retriever(),
-        )
+    def extract_content(self, html):
+        """ Extract content without webpage elements """
+        soup = BeautifulSoup(html, 'html.parser')
+        # Remove script and style elements
+        for script in soup(['script', 'style']):
+            script.decompose()
+        # Extract text content
+        return soup.get_text(separator=' ', strip=True)
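A quick illustration of what `extract_content` keeps and drops, run standalone with the same BeautifulSoup steps (the sample HTML is made up for the example):

```python
from bs4 import BeautifulSoup

html = (
    "<html><head><title>Llama4U</title><style>p {color: red}</style></head>"
    "<body><script>console.log('tracking');</script>"
    "<p>DocReader builds a vector DB from page text.</p></body></html>"
)

# Same steps as extract_content above: drop script/style, keep visible text.
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style"]):
    tag.decompose()
print(soup.get_text(separator=" ", strip=True))
# -> Llama4U DocReader builds a vector DB from page text.
```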

     # Experimental
-    def extract_links(self, html):
-        """ Extract links from this webpage """
+    def extract_links(self, html, url):
+        """ Extract links from the webpage """
         soup = BeautifulSoup(html, 'html.parser')
-        links = [link.get('href') for link in soup.find_all('a')]
+        links = []
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            full_url = urljoin(url, href)
+            if full_url.startswith(self.base_url) and full_url not in self.visited:
+                links.append(full_url)
         return links
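With `base_url` set to the documentation root, relative hrefs are resolved against the current page and off-site links are filtered out. A standalone sketch of that resolution step (the URLs are hypothetical):

```python
from urllib.parse import urljoin

base_url = "https://example.com/docs"
page_url = "https://example.com/docs/index.html"

# Same resolution and same-prefix filter as extract_links above.
for href in ["getting-started.html", "/docs/api", "https://other.site/page"]:
    full_url = urljoin(page_url, href)
    print(full_url, full_url.startswith(base_url))
# https://example.com/docs/getting-started.html True
# https://example.com/docs/api True
# https://other.site/page False
```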

-    async def crawl_and_load(self, url, visited=None):
-        """ Crawl and load contents of the URL """
-        if visited is None:
-            visited = set()
+    async def crawl_and_load(self, url, page):
+        """ Web Crawler """
+        if url in self.visited:
+            return []
+
+        self.visited.add(url)
+
+        try:
+            await page.goto(url, wait_until="networkidle")
+            content = await page.content()
+        except Exception as e:
+            logging.error("Error loading %s: %s", url, str(e))
+            return []
+
+        text_content = self.extract_content(content)
+        links = self.extract_links(content, url)
+
+        doc = Document(page_content=text_content, metadata={"source": url})
+
+        logging.info("Crawled %s, found %d links", url, len(links))
+
+        tasks = [self.crawl_and_load(link, page) for link in links]
+        child_docs = await asyncio.gather(*tasks)
+
+        return [doc] + [d for sublist in child_docs for d in sublist]
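The final line flattens the per-link results from `asyncio.gather` into one document list. In isolation the idiom behaves like this:

```python
# Shape of what asyncio.gather returns above: one list per child link.
child_docs = [["docA"], ["docB", "docC"], []]
flat = [d for sublist in child_docs for d in sublist]
print(flat)  # ['docA', 'docB', 'docC']
```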

+    async def process_documentation(self):
+        """ Entrypoint for reading online documentation """
+        async with async_playwright() as p:
+            browser = await p.chromium.launch()
+            page = await browser.new_page()
+
+            all_docs = await self.crawl_and_load(self.base_url, page)
+
+            await browser.close()
+
+        logging.info("Total documents crawled: %d", len(all_docs))
+        if all_docs:
+            self.create_db(all_docs)
+        else:
+            logging.warning("No documents were crawled.")
+        return all_docs

-        visited.add(url)
-        html_docs = await self.loader.aload()
-        #content = html_docs[0].page_content
-        #links = self.extract_links(html_docs)
+    async def query(self, query: str, num_results: int = 3):
+        """ Query vector database and retrieve results """
+        if not self.docs_retriever:
+            raise ValueError("Document retriever has not been initialized.\
+                Call process_documentation first.")

-        return html_docs
+        relevant_docs = self.docs_retriever.invoke(input=query)
+        return relevant_docs[:num_results]
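Taken together, a minimal driver for the new `DocReader` might look like the sketch below. This is not code from the PR: the URL and query are placeholders, `main_model=None` only works because this revision no longer builds a `MultiQueryRetriever`, the import path assumes the repository layout (`app/src/parsers/documents.py`), and a real run needs Playwright browsers installed (`playwright install`).

```python
import asyncio

from app.src.parsers.documents import DocReader  # path assumed from the repo layout

async def main():
    # base_url is the documentation root to crawl; the URL is a placeholder.
    reader = DocReader(main_model=None, base_url="https://example.com/docs")

    docs = await reader.process_documentation()  # crawl with Playwright, then build the Chroma DB
    print(f"Indexed {len(docs)} pages")

    hits = await reader.query("How do I authenticate against the API?", num_results=3)
    for doc in hits:
        print(doc.metadata["source"])

asyncio.run(main())
```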