Enable URL as context (#19)

* Enable URL as context
* Formatting update
virajmalia · Jun 15, 2024 · 469b50d · 469b50d
1 parent 6e1325c
commit 469b50d
Show file tree

Hide file tree

Showing 6 changed files with 130 additions and 56 deletions.
diff --git a/app/pyproject.toml b/app/pyproject.toml
@@ -23,7 +23,10 @@ dependencies = [
     "langchain-core",
     "langchain-community",
     "langchain-chroma",
-    "termcolor"
+    "lark",
+    "termcolor",
+    "playwright",
+    "html2text",
 ]
 
 [project.scripts]

diff --git a/app/server.py b/app/server.py
@@ -6,9 +6,11 @@
 from langchain.pydantic_v1 import Field
 from langchain_core.runnables import RunnableLambda
 from langchain_core.messages import HumanMessage, AIMessage
+from langchain_community.document_transformers.html2text import Html2TextTransformer
 from langserve import CustomUserType
 from langserve.server import add_routes
 from app.src.llama4u import Llama4U
+from app.src.parsers.documents import DocReader
 
 app = FastAPI(
     title="LangChain Server",
@@ -28,35 +30,68 @@
 
 class ChatHistory(CustomUserType):
     """ Playground Chat Widget """
-    chat: List[Union[HumanMessage, AIMessage]] = \
-        Field(..., extra={"widget": {"type": "chat", "input": "chat"}})
+    chat_history: List[Union[HumanMessage, AIMessage]] = \
+        Field(..., extra={"widget": {"type": "chat", "input": "chat_history"}})
 
-def format_input(input_data: ChatHistory):
+async def get_response_from_docs(url):
+    """ Get a response from a URL page content """
+    doc_reader = DocReader(llama4u.llm, url=url)
+    crawled_data = await doc_reader.crawl_and_load(url)
+
+    h2t = Html2TextTransformer()
+    t_docs = h2t.transform_documents(crawled_data)
+    doc_reader.create_db(t_docs)
+    return doc_reader.docs_retriever.invoke( # type: ignore
+        input='Read each word carefully and \
+            find the relevance with programming and APIs. \
+            Summarize the document such that it can be used \
+            as a context for future conversations.')
+
+async def format_input(input_data: ChatHistory, config):
     """ Format input from the langserve UI """
-    chat_history = input_data.chat
+    chat_history = input_data.chat_history
+    response = None
+    md = config.get('metadata')
+    if md:
+        url = md.get('doc_url')
+        if url:
+            response = await get_response_from_docs(url)
 
     msg_history = []
+    if response and len(msg_history) == 0:
+        # Append the response from docs query to message history
+        # Only do it the first time
+        msg_history.append(
+            HumanMessage(
+                content=f"Use this document \
+                    as context for the rest of our conversation: \
+                    {response[0].page_content}"
+            )
+        )
+
     for message in chat_history:
         if isinstance(message, HumanMessage):
-            msg_history.append({"role": "user", "content": message.content})
+            msg_history.append(HumanMessage(content=message.content))
         elif isinstance(message, AIMessage):
-            msg_history.append({"role": "assistant", "content": message.content})
+            msg_history.append(AIMessage(content=message.content))
 
-    input_data = {"input": msg_history}
+    input_data = {"input": msg_history, "chat_history": msg_history} # type: ignore
 
     return input_data
 
-def format_output(response_data):
-    """ Format output from the Chain for the langserve UI """
-    return response_data.content
-
 llama4u = Llama4U()
+
 input_formatter = RunnableLambda(format_input)
 chat_model = input_formatter | llama4u.with_msg_history
+
 add_routes(
     app,
-    chat_model.with_types(input_type=ChatHistory, output_type=ChatHistory),
-    config_keys=["configurable"],
+    chat_model.with_types(
+        input_type=ChatHistory,
+        output_type=ChatHistory,
+        ).with_config(
+            configurable={"doc_url": "doc_url"}),
+    config_keys=["configurable", "doc_url"],
     path="/llama4u",
 )
 

diff --git a/app/src/input/input.py b/app/src/input/input.py
diff --git a/app/src/llama4u.py b/app/src/llama4u.py
@@ -1,48 +1,68 @@
 """ Llama4U """
 import asyncio
+import argparse
+import importlib.metadata
 from termcolor import colored
 from langchain_community.chat_models.ollama import ChatOllama
 from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
 from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables.history import RunnableWithMessageHistory
-from input.input import parse_arguments
+from langchain_core.runnables.utils import ConfigurableFieldSpec
 
 LLAMA4U_STR = 'Llama4U'
 
+def parse_arguments():
+    """ parse input arguments """
+    version = importlib.metadata.version('Llama4U')
+    parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
+    parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
+    parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
+    return parser.parse_args()
+
 class Llama4U():
     """ Llama4U """
 
-    system_prompt = """Given a chat history and the latest user question \
-            which might reference context in the chat history, \
-            formulate a response that is clear and understandable by an 18yo human."""
+    system_prompt = """You are a helpful AI assistant named Llama4U."""
     prompt = ChatPromptTemplate.from_messages(
         [
             ("system", system_prompt),
             MessagesPlaceholder("chat_history"),
-            ("human", "{input}"),
+            ('human', "{input}"),
         ]
     )
 
     llm = ChatOllama(model='llama3')
     runnable = prompt | llm
 
+    doc_url = None
+
     store = {}
 
     def __init__(self):
         # Initialize LLM chat chain
         self.with_msg_history = RunnableWithMessageHistory(
-            runnable=self.runnable,
+            runnable=self.runnable, # type: ignore
             get_session_history=self.get_session_history,
             input_messages_key="input",
             history_messages_key="chat_history",
+            history_factory_config=[
+                ConfigurableFieldSpec(
+                    id="doc_url",
+                    annotation=str,
+                    name="Web URL",
+                    description="Base URL of the webpage to use as context.",
+                    default="",
+                    is_shared=True,
+                ),
+                ],
             )
 
-    def get_session_history(self, session_id):
+    def get_session_history(self, doc_url):
         """ Get session history from session_id """
-        if session_id not in self.store:
-            self.store[session_id] = ChatMessageHistory()
-        return self.store[session_id]
+        self.doc_url = doc_url
+        self.store[0] = ChatMessageHistory()
+        return self.store[0]
 
     async def chat_session(self):
         """ Chat session with history """

diff --git a/app/src/parsers/documents.py b/app/src/parsers/documents.py
@@ -0,0 +1,48 @@
+""" Document Parser """
+from bs4 import BeautifulSoup
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from langchain_community.document_loaders.chromium import AsyncChromiumLoader
+from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores.chroma import Chroma
+
+class DocReader():
+    """ Document Reader """
+    docs_retriever = None
+    t_docs = None
+
+    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
+        self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
+        self.url = url
+        self.loader = AsyncChromiumLoader([url])
+        self.model = main_model
+
+    def create_db(self, docs):
+        """ Create a vector database from docs """
+        vector_store = Chroma.from_documents(
+            documents=docs,
+            embedding=self.embed_model,
+            )
+
+        self.docs_retriever = MultiQueryRetriever.from_llm(
+            llm=self.model,
+            retriever=vector_store.as_retriever(),
+            )
+
+    # Experimental
+    def extract_links(self, html):
+        """ Extract links from this webpage """
+        soup = BeautifulSoup(html, 'html.parser')
+        links = [link.get('href') for link in soup.find_all('a')]
+        return links
+
+    async def crawl_and_load(self, url, visited=None):
+        """ Crawl and load contents of the URL """
+        if visited is None:
+            visited = set()
+
+        visited.add(url)
+        html_docs = await self.loader.aload()
+        #content = html_docs[0].page_content
+        #links = self.extract_links(html_docs)
+
+        return html_docs
diff --git a/app/src/sources/documents.py b/app/src/sources/documents.py