From 469b50d207848fb3fbe3ee0cc0c63d978455b56e Mon Sep 17 00:00:00 2001
From: Viraj Malia <96500051+virajmalia@users.noreply.github.com>
Date: Fri, 14 Jun 2024 17:08:23 -0700
Subject: [PATCH] Enable URL as context (#19)

* Enable URL as context

* Formatting update
---
 app/pyproject.toml           |  5 ++-
 app/server.py                | 61 ++++++++++++++++++++++++++++--------
 app/src/input/input.py       | 10 ------
 app/src/llama4u.py           | 40 +++++++++++++++++------
 app/src/parsers/documents.py | 48 ++++++++++++++++++++++++++++
 app/src/sources/documents.py | 22 -------------
 6 files changed, 130 insertions(+), 56 deletions(-)
 delete mode 100644 app/src/input/input.py
 create mode 100644 app/src/parsers/documents.py
 delete mode 100644 app/src/sources/documents.py

diff --git a/app/pyproject.toml b/app/pyproject.toml
index e2ec418..0d0c52d 100644
--- a/app/pyproject.toml
+++ b/app/pyproject.toml
@@ -23,7 +23,10 @@ dependencies = [
     "langchain-core",
     "langchain-community",
     "langchain-chroma",
-    "termcolor"
+    "lark",
+    "termcolor",
+    "playwright",
+    "html2text",
 ]
 
 [project.scripts]
diff --git a/app/server.py b/app/server.py
index f16902c..0f59471 100644
--- a/app/server.py
+++ b/app/server.py
@@ -6,9 +6,11 @@
 from langchain.pydantic_v1 import Field
 from langchain_core.runnables import RunnableLambda
 from langchain_core.messages import HumanMessage, AIMessage
+from langchain_community.document_transformers.html2text import Html2TextTransformer
 from langserve import CustomUserType
 from langserve.server import add_routes
 from app.src.llama4u import Llama4U
+from app.src.parsers.documents import DocReader
 
 app = FastAPI(
     title="LangChain Server",
@@ -28,35 +30,68 @@
 class ChatHistory(CustomUserType):
     """ Playground Chat Widget """
-    chat: List[Union[HumanMessage, AIMessage]] = \
-        Field(..., extra={"widget": {"type": "chat", "input": "chat"}})
+    chat_history: List[Union[HumanMessage, AIMessage]] = \
+        Field(..., extra={"widget": {"type": "chat", "input": "chat_history"}})
 
-def format_input(input_data: ChatHistory):
+async def get_response_from_docs(url):
+    """ Get a response from a URL's page content """
+    doc_reader = DocReader(llama4u.llm, url=url)
+    crawled_data = await doc_reader.crawl_and_load(url)
+
+    h2t = Html2TextTransformer()
+    t_docs = h2t.transform_documents(crawled_data)
+    doc_reader.create_db(t_docs)
+    return doc_reader.docs_retriever.invoke(  # type: ignore
+        input='Read each word carefully and \
+            find the relevance with programming and APIs. \
+            Summarize the document such that it can be used \
+            as a context for future conversations.')
+
+async def format_input(input_data: ChatHistory, config):
     """ Format input from the langserve UI """
-    chat_history = input_data.chat
+    chat_history = input_data.chat_history
+    response = None
+    md = config.get('metadata')
+    if md:
+        url = md.get('doc_url')
+        if url:
+            response = await get_response_from_docs(url)
 
     msg_history = []
+    if response and len(msg_history) == 0:
+        # Append the response from docs query to message history
+        # Only do it the first time
+        msg_history.append(
+            HumanMessage(
+                content=f"Use this document \
+                    as context for the rest of our conversation: \
+                    {response[0].page_content}"
+            )
+        )
+
     for message in chat_history:
         if isinstance(message, HumanMessage):
-            msg_history.append({"role": "user", "content": message.content})
+            msg_history.append(HumanMessage(content=message.content))
         elif isinstance(message, AIMessage):
-            msg_history.append({"role": "assistant", "content": message.content})
+            msg_history.append(AIMessage(content=message.content))
 
-    input_data = {"input": msg_history}
+    input_data = {"input": msg_history, "chat_history": msg_history}  # type: ignore
     return input_data
 
-def format_output(response_data):
-    """ Format output from the Chain for the langserve UI """
-    return response_data.content
-
 llama4u = Llama4U()
+
 input_formatter = RunnableLambda(format_input)
 chat_model = input_formatter | llama4u.with_msg_history
+
 add_routes(
     app,
-    chat_model.with_types(input_type=ChatHistory, output_type=ChatHistory),
-    config_keys=["configurable"],
+    chat_model.with_types(
+        input_type=ChatHistory,
+        output_type=ChatHistory,
+    ).with_config(
+        configurable={"doc_url": "doc_url"}),
+    config_keys=["configurable", "doc_url"],
     path="/llama4u",
 )
diff --git a/app/src/input/input.py b/app/src/input/input.py
deleted file mode 100644
index ed43209..0000000
--- a/app/src/input/input.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import importlib.metadata
-import argparse
-
-def parse_arguments():
-    """ parse input arguments """
-    version = importlib.metadata.version('Llama4U')
-    parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
-    parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
-    parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
-    return parser.parse_args()
diff --git a/app/src/llama4u.py b/app/src/llama4u.py
index 0efda92..bd2b827 100755
--- a/app/src/llama4u.py
+++ b/app/src/llama4u.py
@@ -1,48 +1,68 @@
 """ Llama4U """
 import asyncio
+import argparse
+import importlib.metadata
 from termcolor import colored
 from langchain_community.chat_models.ollama import ChatOllama
 from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
 from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables.history import RunnableWithMessageHistory
-from input.input import parse_arguments
+from langchain_core.runnables.utils import ConfigurableFieldSpec
 
 LLAMA4U_STR = 'Llama4U'
 
+def parse_arguments():
+    """ parse input arguments """
+    version = importlib.metadata.version('Llama4U')
+    parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
+    parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
+    parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
+    return parser.parse_args()
+
+class Llama4U():
     """ Llama4U """
-    system_prompt = """Given a chat history and the latest user question \
-        which might reference context in the chat history, \
-        formulate a response that is clear and understandable by an 18yo human."""
+    system_prompt = """You are a helpful AI assistant named Llama4U."""
 
     prompt = ChatPromptTemplate.from_messages(
         [
            ("system", system_prompt),
            MessagesPlaceholder("chat_history"),
-           ("human", "{input}"),
+           ('human', "{input}"),
        ]
     )
 
     llm = ChatOllama(model='llama3')
     runnable = prompt | llm
+    doc_url = None
 
     store = {}
 
     def __init__(self):
         # Initialize LLM chat chain
         self.with_msg_history = RunnableWithMessageHistory(
-            runnable=self.runnable,
+            runnable=self.runnable,  # type: ignore
             get_session_history=self.get_session_history,
             input_messages_key="input",
             history_messages_key="chat_history",
+            history_factory_config=[
+                ConfigurableFieldSpec(
+                    id="doc_url",
+                    annotation=str,
+                    name="Web URL",
+                    description="Base URL of the webpage to use as context.",
+                    default="",
+                    is_shared=True,
+                ),
+            ],
         )
 
-    def get_session_history(self, session_id):
-        """ Get session history from session_id """
-        if session_id not in self.store:
-            self.store[session_id] = ChatMessageHistory()
-        return self.store[session_id]
+    def get_session_history(self, doc_url):
+        """ Get session history for the given doc_url """
+        self.doc_url = doc_url
+        self.store[0] = ChatMessageHistory()
+        return self.store[0]
 
     async def chat_session(self):
         """ Chat session with history """
diff --git a/app/src/parsers/documents.py b/app/src/parsers/documents.py
new file mode 100644
index 0000000..59b10e6
--- /dev/null
+++ b/app/src/parsers/documents.py
@@ -0,0 +1,48 @@
+""" Document Parser """
+from bs4 import BeautifulSoup
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain_community.document_loaders.chromium import AsyncChromiumLoader
+from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores.chroma import Chroma
+
+class DocReader():
+    """ Document Reader """
+    docs_retriever = None
+    t_docs = None
+
+    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
+        self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
+        self.url = url
+        self.loader = AsyncChromiumLoader([url])
+        self.model = main_model
+
+    def create_db(self, docs):
+        """ Create a vector database from docs """
+        vector_store = Chroma.from_documents(
+            documents=docs,
+            embedding=self.embed_model,
+        )
+
+        self.docs_retriever = MultiQueryRetriever.from_llm(
+            llm=self.model,
+            retriever=vector_store.as_retriever(),
+        )
+
+    # Experimental
+    def extract_links(self, html):
+        """ Extract links from this webpage """
+        soup = BeautifulSoup(html, 'html.parser')
+        links = [link.get('href') for link in soup.find_all('a')]
+        return links
+
+    async def crawl_and_load(self, url, visited=None):
+        """ Crawl and load contents of the URL """
+        if visited is None:
+            visited = set()
+
+        visited.add(url)
+        html_docs = await self.loader.aload()
+        #content = html_docs[0].page_content
+        #links = self.extract_links(html_docs)
+
+        return html_docs
diff --git a/app/src/sources/documents.py b/app/src/sources/documents.py
deleted file mode 100644
index 77bbeb1..0000000
--- a/app/src/sources/documents.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
-from llama_index.core.readers import SimpleDirectoryReader
-from llama_index.core.indices import VectorStoreIndex
-from llama_index.core.service_context import ServiceContext
-from llama_index.core.settings import Settings
-
-class DocReader():
-    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', directory_path='/mnt/c/Users/viraj/Documents/ai_db/'):
-        self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
-
-        self.directory_path = directory_path
-        reader = SimpleDirectoryReader(directory_path)
-        docs = reader.load_data()
-
-        Settings.llm = main_model
-        Settings.context_window = 8000
-        service_context = ServiceContext.from_defaults(embed_model=self.embed_model)
-        self.index = VectorStoreIndex.from_documents(docs, service_context=service_context)
-
-    def get_query_engine(self, model):
-        query_engine = self.index.as_query_engine(model)
-        return query_engine
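
Usage note (illustrative, not part of the patch): the new DocReader in app/src/parsers/documents.py is wired up through server.py above, but it can also be driven on its own. The sketch below mirrors get_response_from_docs() under stated assumptions: Playwright's Chromium browser is installed (playwright install chromium), an Ollama llama3 model is pulled locally, and the URL is a placeholder.

    import asyncio

    from langchain_community.chat_models.ollama import ChatOllama
    from langchain_community.document_transformers.html2text import Html2TextTransformer

    from app.src.parsers.documents import DocReader

    async def summarize(url: str):
        # Same flow as server.py's get_response_from_docs()
        llm = ChatOllama(model='llama3')
        reader = DocReader(llm, url=url)
        html_docs = await reader.crawl_and_load(url)   # render the page via AsyncChromiumLoader
        text_docs = Html2TextTransformer().transform_documents(html_docs)  # strip HTML to plain text
        reader.create_db(text_docs)                    # embed into Chroma, wrap in MultiQueryRetriever
        return reader.docs_retriever.invoke('Summarize this document.')

    docs = asyncio.run(summarize('https://example.com/docs'))  # placeholder URL
    print(docs[0].page_content)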
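
On the client side, a minimal sketch of exercising the new doc_url configurable against the /llama4u route, assuming the FastAPI app is served locally on port 8000 and that langserve forwards the configurable into the per-request config that format_input() reads:

    from langchain_core.messages import HumanMessage
    from langserve import RemoteRunnable

    chain = RemoteRunnable("http://localhost:8000/llama4u")

    # doc_url selects the page whose summary seeds the chat history;
    # the URL below is a placeholder.
    response = chain.invoke(
        {"chat_history": [HumanMessage(content="What does this page cover?")]},
        config={"configurable": {"doc_url": "https://example.com/docs"}},
    )
    print(response)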