diff --git a/config/chat-vectorstore-qa.yml b/config/chat-vectorstore-qa.yml
index 62802bc..e46ef73 100644
--- a/config/chat-vectorstore-qa.yml
+++ b/config/chat-vectorstore-qa.yml
@@ -17,6 +17,7 @@ llm:
   Helpful answer:
 
 vector:
+  # vector_path: http://qdrant:6333
   vector_path: ./vectorstore/db_faiss # Path to the vectorstore to do QA retrieval
   vector_download: null
   embeddings_path: ./embeddings/all-MiniLM-L6-v2 # Embeddings used to generate the vectors. To use from HF: sentence-transformers/all-MiniLM-L6-v2
diff --git a/docker-compose.yml b/docker-compose.yml
index e4a21cf..40dc40f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -36,7 +36,7 @@ services:
       - HTTPS_PROXY=http://proxy.unimaas.nl:3128
       - http_proxy=http://proxy.unimaas.nl:3128
       - https_proxy=http://proxy.unimaas.nl:3128
-      - NO_PROXY=127.0.0.1,localhost,137.120.0.0/16
+      - NO_PROXY=127.0.0.1,localhost,137.120.0.0/16,qdrant
     # Containers deployed publicly need to be on the nginx network
     networks:
       - nginx
@@ -58,6 +58,8 @@ services:
   #   #   - 6333:6333
   #   # command:
   #   #   - ./qdrant --config-path /qdrant/qdrant_config.yml
+  #   networks:
+  #     - nginx # Also required to deploy containers publicly
diff --git a/src/libre_chat/llm.py b/src/libre_chat/llm.py
index 07772d2..6776c84 100644
--- a/src/libre_chat/llm.py
+++ b/src/libre_chat/llm.py
@@ -1,36 +1,18 @@
 """Module: Open-source LLM setup"""
 import os
-from datetime import datetime
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
 import torch
 from langchain.chains import ConversationChain, RetrievalQA
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
-from langchain.schema.document import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import (
-    CSVLoader,
-    DirectoryLoader,
-    EverNoteLoader,
-    JSONLoader,
-    PyPDFLoader,
-    TextLoader,
-    UnstructuredEmailLoader,
-    UnstructuredEPubLoader,
-    UnstructuredExcelLoader,
-    UnstructuredHTMLLoader,
-    UnstructuredMarkdownLoader,
-    UnstructuredODTLoader,
-    UnstructuredPowerPointLoader,
-    UnstructuredWordDocumentLoader,
-)
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.llms import LlamaCpp
 from langchain_community.vectorstores import FAISS
 
 from libre_chat.conf import ChatConf, default_conf
-from libre_chat.utils import BOLD, CYAN, END, log, parallel_download
+from libre_chat.utils import BOLD, END, log, parallel_download
+from libre_chat.vectorstore import DEFAULT_DOCUMENT_LOADERS, build_vectorstore
 
 __all__ = [
     "Llm",
@@ -96,9 +78,9 @@ def __init__(
             self.download_data()
 
         if self.vector_path:
-            self.build_vectorstore()
+            build_vectorstore(self.conf, self.document_loaders, self.device, self.vector_path)
         # if self.vector_path and not self.has_vectorstore():
-        #     self.build_vectorstore()
+        #     build_vectorstore(self.conf, self.document_loaders, self.device, self.vector_path)
         # else:
         #     log.info(f"♻️ Reusing existing vectorstore at {BOLD}{self.vector_path}{END}, skip building the vectorstore")
 
@@ -169,7 +151,7 @@ def setup_dbqa(self) -> None:
         # FAISS should automatically use GPU?
         vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings)
         # vectorstore = Qdrant(
-        #     QdrantClient(host=self.conf.vector.vector_path, prefer_grpc=True),
+        #     QdrantClient(url=self.conf.vector.vector_path, prefer_grpc=True),
         #     collection_name="libre_chat_rag",
         #     embeddings=embeddings,
         # )
@@ -187,61 +169,6 @@ def setup_dbqa(self) -> None:
             chain_type_kwargs={"prompt": self.prompt},
         )
 
-    def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FAISS]:
-        """Build vectorstore from documents."""
-        # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py
-        time_start = datetime.now()
-        documents_path = documents_path if documents_path else self.conf.vector.documents_path
-        docs_count = len(os.listdir(documents_path))
-        if docs_count < 1:
-            log.warning(
-                f"⚠️ No documents found in {documents_path}, vectorstore will not be built, and a generic chatbot will be used until documents are added"
-            )
-        else:
-            log.info(
-                f"🏗️ Building the vectorstore from the {BOLD}{CYAN}{docs_count}{END} documents found in {BOLD}{documents_path}{END}, using embeddings from {BOLD}{self.conf.vector.embeddings_path}{END}"
-            )
-            documents: List[Document] = []
-            # Loading all file types provided in the document_loaders object
-            for doc_load in self.document_loaders:
-                loader = DirectoryLoader(
-                    documents_path,
-                    glob=doc_load["glob"],
-                    loader_cls=doc_load["loader_cls"],
-                    loader_kwargs=doc_load["loader_kwargs"] if "loader_kwargs" in doc_load else {},
-                )
-                loaded_docs = loader.load()
-                if len(loaded_docs) > 0:
-                    log.info(f"🗃️ Loaded {len(loaded_docs)} items from {doc_load['glob']} files")
-                documents.extend(loaded_docs)
-
-            # Split the text up into small, semantically meaningful chunks (often sentences) https://js.langchain.com/docs/modules/data_connection/document_transformers/
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=self.conf.vector.chunk_size, chunk_overlap=self.conf.vector.chunk_overlap
-            )
-            splitted_texts = text_splitter.split_documents(documents)
-            # TODO: use fastembed?
-            embeddings = HuggingFaceEmbeddings(
-                model_name=self.conf.vector.embeddings_path, model_kwargs={"device": self.device}
-            )
-            # TODO: use Qdrant vectorstore
-            # os.makedirs(str(self.conf.vector.vector_path), exist_ok=True)
-            # vectorstore = Qdrant.from_documents(
-            #     splitted_texts,
-            #     embeddings,
-            #     # path=self.conf.vector.vector_path,
-            #     host=self.conf.vector.vector_path,
-            #     collection_name="libre_chat_rag",
-            #     prefer_grpc=True,
-            #     # force_recreate=True,
-            # )
-            vectorstore = FAISS.from_documents(splitted_texts, embeddings)
-            if self.vector_path:
-                vectorstore.save_local(self.vector_path)
-            log.info(f"✅ Vectorstore built in {datetime.now() - time_start}")
-            return vectorstore
-        return None
-
     def query(
         self,
         prompt: str,
@@ -383,30 +310,3 @@ async def aquery(
 Only return the helpful answer below and nothing else.
 Helpful answer:
 """
-
-DEFAULT_DOCUMENT_LOADERS: List[Dict[str, Union[str, Any]]] = [
-    {"glob": "*.pdf", "loader_cls": PyPDFLoader},
-    {"glob": "*.csv", "loader_cls": CSVLoader, "loader_kwargs": {"encoding": "utf8"}},
-    {
-        "glob": "*.tsv",
-        "loader_cls": CSVLoader,
-        "loader_kwargs": {"encoding": "utf8", "delimiter": "\t"},
-    },
-    {
-        "glob": "*.psv",
-        "loader_cls": CSVLoader,
-        "loader_kwargs": {"encoding": "utf8", "delimiter": "\\p"},
-    },
-    {"glob": "*.xls?x", "loader_cls": UnstructuredExcelLoader},
-    {"glob": "*.?xhtm?l", "loader_cls": UnstructuredHTMLLoader},
-    {"glob": "*.xml", "loader_cls": UnstructuredHTMLLoader},
-    {"glob": "*.json*", "loader_cls": JSONLoader},
-    {"glob": "*.md*", "loader_cls": UnstructuredMarkdownLoader},
-    {"glob": "*.txt", "loader_cls": TextLoader, "loader_kwargs": {"encoding": "utf8"}},
-    {"glob": "*.doc?x", "loader_cls": UnstructuredWordDocumentLoader},
-    {"glob": "*.odt", "loader_cls": UnstructuredODTLoader},
-    {"glob": "*.ppt?x", "loader_cls": UnstructuredPowerPointLoader},
-    {"glob": "*.epub", "loader_cls": UnstructuredEPubLoader},
-    {"glob": "*.eml", "loader_cls": UnstructuredEmailLoader},
-    {"glob": "*.enex", "loader_cls": EverNoteLoader},
-]
diff --git a/src/libre_chat/vectorstore.py b/src/libre_chat/vectorstore.py
new file mode 100644
index 0000000..b3550e7
--- /dev/null
+++ b/src/libre_chat/vectorstore.py
@@ -0,0 +1,115 @@
+"""Module: Build the vectorstore and define the default document loaders"""
+import os
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Union
+
+from langchain.schema.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import (
+    CSVLoader,
+    DirectoryLoader,
+    EverNoteLoader,
+    JSONLoader,
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredExcelLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+
+from libre_chat.conf import ChatConf
+from libre_chat.utils import BOLD, CYAN, END, log
+
+
+def build_vectorstore(
+    conf: ChatConf,
+    document_loaders: List[Dict[str, Any]],
+    device: Any,
+    vector_path: Optional[str] = None,
+) -> Optional[FAISS]:
+    """Build vectorstore from documents."""
+    # NOTE: using Qdrant is blocked by the UM proxy...
+    # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py
+    time_start = datetime.now()
+    documents_path = conf.vector.documents_path
+    docs_count = len(os.listdir(documents_path))
+    if docs_count < 1:
+        log.warning(
+            f"⚠️ No documents found in {documents_path}, vectorstore will not be built, and a generic chatbot will be used until documents are added"
+        )
+    else:
+        log.info(
+            f"🏗️ Building the vectorstore from the {BOLD}{CYAN}{docs_count}{END} documents found in {BOLD}{documents_path}{END}, using embeddings from {BOLD}{conf.vector.embeddings_path}{END}"
+        )
+        documents: List[Document] = []
+        # Load all file types registered in the document_loaders list
+        for doc_load in document_loaders:
+            loader = DirectoryLoader(
+                documents_path,
+                glob=doc_load["glob"],
+                loader_cls=doc_load["loader_cls"],
+                loader_kwargs=doc_load.get("loader_kwargs", {}),
+            )
+            loaded_docs = loader.load()
+            if len(loaded_docs) > 0:
+                log.info(f"🗃️ Loaded {len(loaded_docs)} items from {doc_load['glob']} files")
+            documents.extend(loaded_docs)
+
+        # Split the text up into small, semantically meaningful chunks (often sentences) https://js.langchain.com/docs/modules/data_connection/document_transformers/
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=conf.vector.chunk_size, chunk_overlap=conf.vector.chunk_overlap
+        )
+        splitted_texts = text_splitter.split_documents(documents)
+        # TODO: use fastembed?
+        embeddings = HuggingFaceEmbeddings(
+            model_name=conf.vector.embeddings_path, model_kwargs={"device": device}
+        )
+        # TODO: use Qdrant vectorstore
+        # os.makedirs(str(conf.vector.vector_path), exist_ok=True)
+        # vectorstore = Qdrant.from_documents(
+        #     splitted_texts,
+        #     embeddings,
+        #     # path=conf.vector.vector_path,
+        #     url=conf.vector.vector_path,
+        #     collection_name="libre_chat_rag",
+        #     prefer_grpc=True,
+        #     # force_recreate=True,
+        # )
+        vectorstore = FAISS.from_documents(splitted_texts, embeddings)
+        if vector_path:
+            vectorstore.save_local(vector_path)
+        log.info(f"✅ Vectorstore built in {datetime.now() - time_start}")
+        return vectorstore
+    return None
+
+
+DEFAULT_DOCUMENT_LOADERS: List[Dict[str, Union[str, Any]]] = [
+    {"glob": "*.pdf", "loader_cls": PyPDFLoader},
+    {"glob": "*.csv", "loader_cls": CSVLoader, "loader_kwargs": {"encoding": "utf8"}},
+    {
+        "glob": "*.tsv",
+        "loader_cls": CSVLoader,
+        "loader_kwargs": {"encoding": "utf8", "delimiter": "\t"},
+    },
+    {
+        "glob": "*.psv",
+        "loader_cls": CSVLoader,
+        # csv delimiters must be a single character; use the pipe for pipe-separated values
+        "loader_kwargs": {"encoding": "utf8", "delimiter": "|"},
+    },
+    {"glob": "*.xls?x", "loader_cls": UnstructuredExcelLoader},
+    {"glob": "*.?xhtm?l", "loader_cls": UnstructuredHTMLLoader},
+    {"glob": "*.xml", "loader_cls": UnstructuredHTMLLoader},
+    {"glob": "*.json*", "loader_cls": JSONLoader},
+    {"glob": "*.md*", "loader_cls": UnstructuredMarkdownLoader},
+    {"glob": "*.txt", "loader_cls": TextLoader, "loader_kwargs": {"encoding": "utf8"}},
+    {"glob": "*.doc?x", "loader_cls": UnstructuredWordDocumentLoader},
+    {"glob": "*.odt", "loader_cls": UnstructuredODTLoader},
+    {"glob": "*.ppt?x", "loader_cls": UnstructuredPowerPointLoader},
+    {"glob": "*.epub", "loader_cls": UnstructuredEPubLoader},
+    {"glob": "*.eml", "loader_cls": UnstructuredEmailLoader},
+    {"glob": "*.enex", "loader_cls": EverNoteLoader},
+]
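
A minimal usage sketch of the refactored helper, assuming the default_conf object that llm.py already imports, a CPU device, and the FAISS path from config/chat-vectorstore-qa.yml:

    from libre_chat.conf import default_conf
    from libre_chat.vectorstore import DEFAULT_DOCUMENT_LOADERS, build_vectorstore

    # Build the FAISS index from the documents folder defined in the config
    # and persist it to the same path the QA chain loads from.
    vectorstore = build_vectorstore(
        default_conf,
        DEFAULT_DOCUMENT_LOADERS,
        "cpu",  # assumption: pass "cuda" instead when a GPU is available
        vector_path="./vectorstore/db_faiss",
    )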
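
The qdrant entry added to NO_PROXY lets container-to-container traffic reach the qdrant service without going through the university proxy. Once that route works, the Qdrant wiring commented out in setup_dbqa() would look roughly like the sketch below, based on the qdrant-client and langchain-community APIs (untested here; embeddings is the HuggingFaceEmbeddings instance built in setup_dbqa()):

    from langchain_community.vectorstores import Qdrant
    from qdrant_client import QdrantClient

    # vector_path would point at the service, e.g. http://qdrant:6333 (see the yml comment)
    client = QdrantClient(url="http://qdrant:6333", prefer_grpc=True)
    vectorstore = Qdrant(
        client=client,
        collection_name="libre_chat_rag",
        embeddings=embeddings,
    )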
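
Since DEFAULT_DOCUMENT_LOADERS is now importable on its own, callers can extend the loader table before building. A hypothetical example, adding reStructuredText support via the UnstructuredRSTLoader that langchain-community ships:

    from langchain_community.document_loaders import UnstructuredRSTLoader

    from libre_chat.conf import default_conf
    from libre_chat.vectorstore import DEFAULT_DOCUMENT_LOADERS, build_vectorstore

    # Extend the default table with one more glob -> loader mapping
    loaders = [*DEFAULT_DOCUMENT_LOADERS, {"glob": "*.rst", "loader_cls": UnstructuredRSTLoader}]
    build_vectorstore(default_conf, loaders, "cpu", vector_path="./vectorstore/db_faiss")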