Enable URL as context #19

Merged: 2 commits, Jun 15, 2024
app/pyproject.toml (4 additions, 1 deletion)
@@ -23,7 +23,10 @@ dependencies = [
     "langchain-core",
     "langchain-community",
     "langchain-chroma",
-    "termcolor"
+    "lark",
+    "termcolor",
+    "playwright",
+    "html2text",
 ]
 
 [project.scripts]
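Note (not part of the diff): the new playwright dependency supplies the headless Chromium that AsyncChromiumLoader in app/src/parsers/documents.py drives, so the browser binaries usually still need a one-time `playwright install chromium` after the Python package itself is installed.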
app/server.py (48 additions, 13 deletions)
@@ -6,9 +6,11 @@
 from langchain.pydantic_v1 import Field
 from langchain_core.runnables import RunnableLambda
 from langchain_core.messages import HumanMessage, AIMessage
+from langchain_community.document_transformers.html2text import Html2TextTransformer
 from langserve import CustomUserType
 from langserve.server import add_routes
 from app.src.llama4u import Llama4U
+from app.src.parsers.documents import DocReader
 
 app = FastAPI(
     title="LangChain Server",
@@ -28,35 +30,68 @@
 
 class ChatHistory(CustomUserType):
     """ Playground Chat Widget """
-    chat: List[Union[HumanMessage, AIMessage]] = \
-        Field(..., extra={"widget": {"type": "chat", "input": "chat"}})
+    chat_history: List[Union[HumanMessage, AIMessage]] = \
+        Field(..., extra={"widget": {"type": "chat", "input": "chat_history"}})
 
-def format_input(input_data: ChatHistory):
+async def get_response_from_docs(url):
+    """ Get a response from a URL page content """
+    doc_reader = DocReader(llama4u.llm, url=url)
+    crawled_data = await doc_reader.crawl_and_load(url)
+
+    h2t = Html2TextTransformer()
+    t_docs = h2t.transform_documents(crawled_data)
+    doc_reader.create_db(t_docs)
+    return doc_reader.docs_retriever.invoke(  # type: ignore
+        input='Read each word carefully and \
+            find the relevance with programming and APIs. \
+            Summarize the document such that it can be used \
+            as a context for future conversations.')
+
+async def format_input(input_data: ChatHistory, config):
     """ Format input from the langserve UI """
-    chat_history = input_data.chat
+    chat_history = input_data.chat_history
+    response = None
+    md = config.get('metadata')
+    if md:
+        url = md.get('doc_url')
+        if url:
+            response = await get_response_from_docs(url)
 
     msg_history = []
+    if response and len(msg_history) == 0:
+        # Append the response from docs query to message history
+        # Only do it the first time
+        msg_history.append(
+            HumanMessage(
+                content=f"Use this document \
+                    as context for the rest of our conversation: \
+                    {response[0].page_content}"
+            )
+        )
+
     for message in chat_history:
         if isinstance(message, HumanMessage):
-            msg_history.append({"role": "user", "content": message.content})
+            msg_history.append(HumanMessage(content=message.content))
         elif isinstance(message, AIMessage):
-            msg_history.append({"role": "assistant", "content": message.content})
+            msg_history.append(AIMessage(content=message.content))
 
-    input_data = {"input": msg_history}
+    input_data = {"input": msg_history, "chat_history": msg_history}  # type: ignore
 
     return input_data
 
 def format_output(response_data):
     """ Format output from the Chain for the langserve UI """
     return response_data.content
 
 llama4u = Llama4U()
 
 input_formatter = RunnableLambda(format_input)
 chat_model = input_formatter | llama4u.with_msg_history
 
 add_routes(
     app,
-    chat_model.with_types(input_type=ChatHistory, output_type=ChatHistory),
-    config_keys=["configurable"],
+    chat_model.with_types(
+        input_type=ChatHistory,
+        output_type=ChatHistory,
+    ).with_config(
+        configurable={"doc_url": "doc_url"}),
+    config_keys=["configurable", "doc_url"],
     path="/llama4u",
 )

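As a rough illustration of how a client could exercise the new doc_url field, here is a hedged sketch (not part of this PR); it assumes the app is served locally on port 8000, and the URL and question are placeholders:

```python
# Hypothetical client sketch: invoke the /llama4u route with a doc_url so the
# server-side format_input() can crawl the page and prepend it as context.
from langchain_core.messages import HumanMessage
from langserve import RemoteRunnable

llama4u_chain = RemoteRunnable("http://localhost:8000/llama4u")

reply = llama4u_chain.invoke(
    {"chat_history": [HumanMessage(content="What is this page about?")]},
    # "doc_url" is the configurable field exposed through config_keys above.
    config={"configurable": {"doc_url": "https://example.com/some/page"}},
)
print(reply)
```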
app/src/input/input.py (0 additions, 10 deletions)

This file was deleted.

app/src/llama4u.py (30 additions, 10 deletions)
@@ -1,48 +1,68 @@
""" Llama4U """
import asyncio
import argparse
import importlib.metadata
from termcolor import colored
from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from input.input import parse_arguments
from langchain_core.runnables.utils import ConfigurableFieldSpec

LLAMA4U_STR = 'Llama4U'

def parse_arguments():
""" parse input arguments """
version = importlib.metadata.version('Llama4U')
parser = argparse.ArgumentParser(description=f'Llama4U v{version}')
parser.add_argument('-q', '--query', type=str, required=False, help='Single Query')
parser.add_argument('-v', '--verbose', type=int, required=False, help='Enable verbose output')
return parser.parse_args()

class Llama4U():
""" Llama4U """

system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, \
formulate a response that is clear and understandable by an 18yo human."""
system_prompt = """You are a helpful AI assistant named Llama4U."""
prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
('human', "{input}"),
]
)

llm = ChatOllama(model='llama3')
runnable = prompt | llm

doc_url = None

store = {}

def __init__(self):
# Initialize LLM chat chain
self.with_msg_history = RunnableWithMessageHistory(
runnable=self.runnable,
runnable=self.runnable, # type: ignore
get_session_history=self.get_session_history,
input_messages_key="input",
history_messages_key="chat_history",
history_factory_config=[
ConfigurableFieldSpec(
id="doc_url",
annotation=str,
name="Web URL",
description="Base URL of the webpage to use as context.",
default="",
is_shared=True,
),
],
)

def get_session_history(self, session_id):
def get_session_history(self, doc_url):
""" Get session history from session_id """
if session_id not in self.store:
self.store[session_id] = ChatMessageHistory()
return self.store[session_id]
self.doc_url = doc_url
self.store[0] = ChatMessageHistory()
return self.store[0]

async def chat_session(self):
""" Chat session with history """
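For context on how the new history_factory_config is consumed, a hedged sketch of invoking the wrapped chain directly is below (assumptions: a local Ollama instance serving llama3, and a placeholder URL):

```python
# Hypothetical direct invocation: RunnableWithMessageHistory forwards the
# "doc_url" configurable (declared via ConfigurableFieldSpec) to
# get_session_history(doc_url), which stores it on the Llama4U instance.
from app.src.llama4u import Llama4U

llama4u = Llama4U()
result = llama4u.with_msg_history.invoke(
    {"input": "Give me a one-line summary of the page."},
    config={"configurable": {"doc_url": "https://example.com/some/page"}},
)
print(result.content)
```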
app/src/parsers/documents.py (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
""" Document Parser """
from bs4 import BeautifulSoup
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders.chromium import AsyncChromiumLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma

class DocReader():
""" Document Reader """
docs_retriever = None
t_docs = None

def __init__(self, main_model, st_model='mixedbread-ai/mxbai-embed-large-v1', url=''):
self.embed_model = HuggingFaceEmbeddings(model_name=st_model)
self.url = url
self.loader = AsyncChromiumLoader([url])
self.model = main_model

def create_db(self, docs):
""" Create a verctor database from docs """
vector_store = Chroma.from_documents(
documents=docs,
embedding=self.embed_model,
)

self.docs_retriever = MultiQueryRetriever.from_llm(
llm=self.model,
retriever=vector_store.as_retriever(),
)

# Experimental
def extract_links(self, html):
""" Extract links from this webpage """
soup = BeautifulSoup(html, 'html.parser')
links = [link.get('href') for link in soup.find_all('a')]
return links

async def crawl_and_load(self, url, visited=None):
""" Crawl and load contents of the URL """
if visited is None:
visited = set()

visited.add(url)
html_docs = await self.loader.aload()
#content = html_docs[0].page_content
#links = self.extract_links(html_docs)

return html_docs
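A hedged usage sketch for DocReader, mirroring what get_response_from_docs() in app/server.py does (the llama3 model and the URL are illustrative assumptions):

```python
# Hypothetical standalone use of DocReader: render the page, strip it to text,
# index it in Chroma, then query it through the MultiQueryRetriever.
import asyncio

from langchain_community.chat_models.ollama import ChatOllama
from langchain_community.document_transformers.html2text import Html2TextTransformer
from app.src.parsers.documents import DocReader

async def summarize(url: str):
    reader = DocReader(ChatOllama(model='llama3'), url=url)
    html_docs = await reader.crawl_and_load(url)                       # headless Chromium fetch
    text_docs = Html2TextTransformer().transform_documents(html_docs)  # HTML to plain text
    reader.create_db(text_docs)                                        # embed and store in Chroma
    return reader.docs_retriever.invoke("Summarize this page.")

docs = asyncio.run(summarize("https://example.com/some/page"))
print(docs[0].page_content)
```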
app/src/sources/documents.py (0 additions, 22 deletions)

This file was deleted.
