Skip to content

Commit

Permalink
Enable URL as context (#19)
Browse files Browse the repository at this point in the history
* Enable URL as context

* Formatting update
  • Loading branch information
virajmalia authored Jun 15, 2024
1 parent 6e1325c commit 469b50d
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 56 deletions.
5 changes: 4 additions & 1 deletion app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ dependencies = [
"langchain-core",
"langchain-community",
"langchain-chroma",
"termcolor"
"lark",
"termcolor",
"playwright",
"html2text",
]

[project.scripts]
Expand Down
61 changes: 48 additions & 13 deletions app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
from langchain.pydantic_v1 import Field
from langchain_core.runnables import RunnableLambda
from langchain_core.messages import HumanMessage, AIMessage
from langchain_community.document_transformers.html2text import Html2TextTransformer
from langserve import CustomUserType
from langserve.server import add_routes
from app.src.llama4u import Llama4U
from app.src.parsers.documents import DocReader

app = FastAPI(
title="LangChain Server",
Expand All @@ -28,35 +30,68 @@

class ChatHistory(CustomUserType):
""" Playground Chat Widget """
# NOTE(review): this listing is a commit diff shown without +/- markers.
# The `chat` field below appears to be the pre-commit definition and
# `chat_history` its replacement -- confirm against the file at this commit.
chat: List[Union[HumanMessage, AIMessage]] = \
Field(..., extra={"widget": {"type": "chat", "input": "chat"}})
# List of human/AI messages; the `extra` metadata declares a widget of
# type "chat" whose input is the `chat_history` field.
chat_history: List[Union[HumanMessage, AIMessage]] = \
Field(..., extra={"widget": {"type": "chat", "input": "chat_history"}})

def format_input(input_data: ChatHistory):
async def get_response_from_docs(url):
    """ Get a response from a URL page content

    Crawls ``url`` with a DocReader, converts the fetched HTML documents to
    text with Html2TextTransformer, indexes them through
    ``DocReader.create_db`` and queries the resulting retriever with a fixed
    summarisation prompt.

    Args:
        url: Address of the web page to use as context.

    Returns:
        The result of ``doc_reader.docs_retriever.invoke`` for the fixed
        prompt.
    """
    doc_reader = DocReader(llama4u.llm, url=url)
    crawled_data = await doc_reader.crawl_and_load(url)

    h2t = Html2TextTransformer()
    t_docs = h2t.transform_documents(crawled_data)
    doc_reader.create_db(t_docs)

    # Adjacent literals instead of backslash continuations inside a single
    # literal: with continuations, the leading indentation of each following
    # source line becomes part of the prompt text.
    query = (
        'Read each word carefully and '
        'find the relevance with programming and APIs. '
        'Summarize the document such that it can be used '
        'as a context for future conversations.'
    )
    return doc_reader.docs_retriever.invoke(input=query)  # type: ignore

async def format_input(input_data: ChatHistory, config):
""" Format input from the langserve UI

Builds the message list handed to the chain. When the config's
'metadata' entry carries a 'doc_url', the page is queried through
get_response_from_docs() and the first result's page_content is
prepended as a HumanMessage. The messages of the incoming chat history
follow in their original order.

Returns a dict whose "input" and "chat_history" keys both reference the
same message list.

NOTE(review): this listing is a commit diff shown without +/- markers;
several statements below appear twice (pre-commit line followed by its
replacement). Confirm against the file at this commit.
"""
# NOTE(review): `input_data.chat` looks like the pre-commit field name,
# superseded by `chat_history` on the next line.
chat_history = input_data.chat
chat_history = input_data.chat_history
response = None
# Optional document context: `doc_url` is looked up under config['metadata'].
md = config.get('metadata')
if md:
url = md.get('doc_url')
if url:
response = await get_response_from_docs(url)

msg_history = []
# `len(msg_history) == 0` is always true here: the list was created on the
# previous line, so the condition effectively depends on `response` only.
if response and len(msg_history) == 0:
# Append the response from docs query to message history
# Only do it the first time
msg_history.append(
HumanMessage(
content=f"Use this document \
as context for the rest of our conversation: \
{response[0].page_content}"
)
)

# Copy the UI messages into the history, preserving their order.
# NOTE(review): in each branch the dict-style append looks like the
# pre-commit line and the message-object append its replacement.
for message in chat_history:
if isinstance(message, HumanMessage):
msg_history.append({"role": "user", "content": message.content})
msg_history.append(HumanMessage(content=message.content))
elif isinstance(message, AIMessage):
msg_history.append({"role": "assistant", "content": message.content})
msg_history.append(AIMessage(content=message.content))

# NOTE(review): the first assignment looks like the pre-commit line; the
# second one (with both keys) overrides it.
input_data = {"input": msg_history}
input_data = {"input": msg_history, "chat_history": msg_history} # type: ignore

return input_data

def format_output(response_data):
    """ Extract the `content` of a chain response for the langserve UI """
    text = response_data.content
    return text

# Module-level wiring: create the model wrapper, put the input formatter in
# front of its history-aware chain, and register the result under /llama4u.
llama4u = Llama4U()

input_formatter = RunnableLambda(format_input)
chat_model = input_formatter | llama4u.with_msg_history

add_routes(
app,
# NOTE(review): this listing is a commit diff shown without +/- markers. The
# next two lines appear to be the pre-commit arguments, replaced by the
# multi-line `with_types(...).with_config(...)` call and the extended
# `config_keys` that follow -- confirm against the file at this commit.
chat_model.with_types(input_type=ChatHistory, output_type=ChatHistory),
config_keys=["configurable"],
chat_model.with_types(
input_type=ChatHistory,
output_type=ChatHistory,
).with_config(
# NOTE(review): this binds the literal string "doc_url" as the configurable
# value, while format_input() reads `doc_url` from config['metadata'] --
# verify this is the intended default.
configurable={"doc_url": "doc_url"}),
config_keys=["configurable", "doc_url"],
path="/llama4u",
)

Expand Down
10 changes: 0 additions & 10 deletions app/src/input/input.py

This file was deleted.

40 changes: 30 additions & 10 deletions app/src/llama4u.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,68 @@
""" Llama4U """
import asyncio
import argparse
import importlib.metadata
from termcolor import colored
from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from input.input import parse_arguments
from langchain_core.runnables.utils import ConfigurableFieldSpec

LLAMA4U_STR = 'Llama4U'

def parse_arguments(argv=None):
    """ parse input arguments

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, which is the previous
            behaviour.

    Returns:
        argparse.Namespace with ``query`` (str or None) and ``verbose``
        (int or None).
    """
    try:
        version = importlib.metadata.version('Llama4U')
    except importlib.metadata.PackageNotFoundError:
        # No installed distribution metadata (e.g. running from a source
        # checkout): keep the CLI usable instead of failing before parsing.
        version = 'unknown'
    parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
    parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
    parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
    return parser.parse_args(argv)

class Llama4U():
""" Llama4U """

system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, \
formulate a response that is clear and understandable by an 18yo human."""
system_prompt = """You are a helpful AI assistant named Llama4U."""
prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
('human', "{input}"),
]
)

llm = ChatOllama(model='llama3')
runnable = prompt | llm

doc_url = None

store = {}

def __init__(self):
""" Build the history-aware chat chain and store it on `with_msg_history`. """
# Initialize LLM chat chain
self.with_msg_history = RunnableWithMessageHistory(
# NOTE(review): this listing is a commit diff shown without +/- markers. The
# two `runnable=` lines appear to be the pre- and post-commit versions of the
# same argument -- only one can remain in real code.
runnable=self.runnable,
runnable=self.runnable, # type: ignore
get_session_history=self.get_session_history,
# Keys under which the chain receives the new input and the stored history;
# they match the "{input}" and "chat_history" slots of the class prompt.
input_messages_key="input",
history_messages_key="chat_history",
# Declares a configurable `doc_url` field; presumably it is handed to
# get_session_history(doc_url) by matching name -- confirm against the
# RunnableWithMessageHistory documentation.
history_factory_config=[
ConfigurableFieldSpec(
id="doc_url",
annotation=str,
name="Web URL",
description="Base URL of the webpage to use as context.",
default="",
is_shared=True,
),
],
)

# NOTE(review): this listing is a commit diff shown without +/- markers. The
# first `def` line and the three `session_id` statements below appear to be
# the pre-commit version; the `doc_url` signature and the last three
# statements appear to be the replacement -- confirm against the file at this
# commit.
def get_session_history(self, session_id):
def get_session_history(self, doc_url):
""" Get the chat message history used by the chain """
# Pre-commit variant: one history per session id, created on first use.
if session_id not in self.store:
self.store[session_id] = ChatMessageHistory()
return self.store[session_id]
# Post-commit variant: remembers the URL on the instance and stores a fresh
# ChatMessageHistory under key 0 on every call, discarding whatever was there.
# `store` is a class-level dict, so it is shared by all instances.
# NOTE(review): looks deliberate, since the server resends the full chat on
# each request -- verify before changing.
self.doc_url = doc_url
self.store[0] = ChatMessageHistory()
return self.store[0]

async def chat_session(self):
""" Chat session with history """
Expand Down
48 changes: 48 additions & 0 deletions app/src/parsers/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
""" Document Parser """
from bs4 import BeautifulSoup
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders.chromium import AsyncChromiumLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma

class DocReader():
    """ Document Reader

    Loads a web page through AsyncChromiumLoader, indexes the resulting
    documents in a Chroma vector store and exposes a MultiQueryRetriever
    over that store.
    """
    # Retriever created by create_db(); stays None until it has been called.
    docs_retriever = None
    # Not assigned anywhere in this class.
    t_docs = None

    def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
        """ Set up the embedding model and the page loader

        Args:
            main_model: LLM handed to the multi-query retriever.
            st_model: Model name passed to HuggingFaceEmbeddings.
            url: Page the default loader is bound to.
        """
        self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
        self.url = url
        self.loader = AsyncChromiumLoader([url])
        self.model = main_model

    def create_db(self, docs):
        """ Create a vector database from docs

        Stores a retriever over the new database in `self.docs_retriever`.
        """
        vector_store = Chroma.from_documents(
            documents=docs,
            embedding=self.embed_model,
        )

        self.docs_retriever = MultiQueryRetriever.from_llm(
            llm=self.model,
            retriever=vector_store.as_retriever(),
        )

    # Experimental
    def extract_links(self, html):
        """ Extract links from this webpage

        Returns the `href` value of every anchor that has one.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # href=True skips anchors without an href attribute, which would
        # otherwise contribute None entries to the result.
        return [link['href'] for link in soup.find_all('a', href=True)]

    async def crawl_and_load(self, url, visited=None):
        """ Crawl and load contents of the URL

        Args:
            url: Page to load.
            visited: Optional set of already-visited URLs; `url` is added
                to it.

        Returns:
            The documents returned by the loader's `aload()`.
        """
        if visited is None:
            visited = set()

        visited.add(url)

        # The stored loader is bound to self.url; build a one-off loader when
        # a different page is requested so the `url` argument is honoured.
        loader = self.loader if url == self.url else AsyncChromiumLoader([url])
        # TODO: follow extract_links() results to crawl linked pages.
        return await loader.aload()
22 changes: 0 additions & 22 deletions app/src/sources/documents.py

This file was deleted.

0 comments on commit 469b50d

Please sign in to comment.