Skip to content

Commit

Permalink
Move build_vectorstore to a new file
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Jan 11, 2024
1 parent af9d5af commit e55a50f
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 107 deletions.
1 change: 1 addition & 0 deletions config/chat-vectorstore-qa.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ llm:
Helpful answer:
vector:
# vector_path: http://qdrant:6333
vector_path: ./vectorstore/db_faiss # Path to the vectorstore to do QA retrieval
vector_download: null
embeddings_path: ./embeddings/all-MiniLM-L6-v2 # Embeddings used to generate the vectors. To use from HF: sentence-transformers/all-MiniLM-L6-v2
Expand Down
4 changes: 3 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ services:
- HTTPS_PROXY=http://proxy.unimaas.nl:3128
- http_proxy=http://proxy.unimaas.nl:3128
- https_proxy=http://proxy.unimaas.nl:3128
- NO_PROXY=127.0.0.1,localhost,137.120.0.0/16
- NO_PROXY=127.0.0.1,localhost,137.120.0.0/16,qdrant
# Containers deployed publicly need to be on the nginx network
networks:
- nginx
Expand All @@ -58,6 +58,8 @@ services:
# # - 6333:6333
# # command:
# # - ./qdrant --config-path /qdrant/qdrant_config.yml
# networks:
# - nginx


# Also required to deploy containers publicly
Expand Down
112 changes: 6 additions & 106 deletions src/libre_chat/llm.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,18 @@
"""Module: Open-source LLM setup"""
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

import torch
from langchain.chains import ConversationChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
CSVLoader,
DirectoryLoader,
EverNoteLoader,
JSONLoader,
PyPDFLoader,
TextLoader,
UnstructuredEmailLoader,
UnstructuredEPubLoader,
UnstructuredExcelLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import FAISS

from libre_chat.conf import ChatConf, default_conf
from libre_chat.utils import BOLD, CYAN, END, log, parallel_download
from libre_chat.utils import BOLD, END, log, parallel_download
from libre_chat.vectorstore import DEFAULT_DOCUMENT_LOADERS, build_vectorstore

__all__ = [
"Llm",
Expand Down Expand Up @@ -96,9 +78,9 @@ def __init__(

self.download_data()
if self.vector_path:
self.build_vectorstore()
build_vectorstore(self.conf, self.document_loaders, self.device, self.vector_path)
# if self.vector_path and not self.has_vectorstore():
# self.build_vectorstore()
# build_vectorstore(self.conf, self.vector_path, self.device)
# else:
# log.info(f"♻️ Reusing existing vectorstore at {BOLD}{self.vector_path}{END}, skip building the vectorstore")

Expand Down Expand Up @@ -169,7 +151,7 @@ def setup_dbqa(self) -> None:
# FAISS should automatically use GPU?
vectorstore = FAISS.load_local(self.get_vectorstore(), embeddings)
# vectorstore = Qdrant(
# QdrantClient(host=self.conf.vector.vector_path, prefer_grpc=True),
# QdrantClient(url=self.conf.vector.vector_path, prefer_grpc=True),
# collection_name="libre_chat_rag",
# embeddings=embeddings,
# )
Expand All @@ -187,61 +169,6 @@ def setup_dbqa(self) -> None:
chain_type_kwargs={"prompt": self.prompt},
)

def build_vectorstore(self, documents_path: Optional[str] = None) -> Optional[FAISS]:
    """Build a FAISS vectorstore from the files found in the documents folder.

    Args:
        documents_path: Folder to index; falls back to `self.conf.vector.documents_path` when not provided.

    Returns:
        The vectorstore that was built (also saved to `self.vector_path` when it is set),
        or None when there is nothing to index.
    """
    # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py
    time_start = datetime.now()
    documents_path = documents_path if documents_path else self.conf.vector.documents_path
    # A missing documents folder is handled like an empty one instead of crashing in os.listdir
    has_folder = bool(documents_path) and os.path.isdir(documents_path)
    docs_count = len(os.listdir(documents_path)) if has_folder else 0
    if docs_count < 1:
        log.warning(
            f"⚠️ No documents found in {documents_path}, vectorstore will not be built, and a generic chatbot will be used until documents are added"
        )
        return None

    log.info(
        f"🏗️ Building the vectorstore from the {BOLD}{CYAN}{docs_count}{END} documents found in {BOLD}{documents_path}{END}, using embeddings from {BOLD}{self.conf.vector.embeddings_path}{END}"
    )
    documents: List[Document] = []
    # Loading all file types provided in the document_loaders object
    for doc_load in self.document_loaders:
        loader = DirectoryLoader(
            documents_path,
            glob=doc_load["glob"],
            loader_cls=doc_load["loader_cls"],
            loader_kwargs=doc_load.get("loader_kwargs", {}),
        )
        loaded_docs = loader.load()
        if len(loaded_docs) > 0:
            log.info(f"🗃️ Loaded {len(loaded_docs)} items from {doc_load['glob']} files")
            documents.extend(loaded_docs)

    # Split the text up into small, semantically meaningful chunks (often sentences) https://js.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=self.conf.vector.chunk_size, chunk_overlap=self.conf.vector.chunk_overlap
    )
    splitted_texts = text_splitter.split_documents(documents)
    if not splitted_texts:
        # Files are present but no loader produced content: FAISS cannot build an index from an empty list
        log.warning(
            f"⚠️ No content could be loaded from the documents found in {documents_path}, vectorstore will not be built"
        )
        return None

    # TODO: use fastembed?
    embeddings = HuggingFaceEmbeddings(
        model_name=self.conf.vector.embeddings_path, model_kwargs={"device": self.device}
    )
    # TODO: use Qdrant vectorstore
    # os.makedirs(str(self.conf.vector.vector_path), exist_ok=True)
    # vectorstore = Qdrant.from_documents(
    #     splitted_texts,
    #     embeddings,
    #     # path=self.conf.vector.vector_path,
    #     host=self.conf.vector.vector_path,
    #     collection_name="libre_chat_rag",
    #     prefer_grpc=True,
    #     # force_recreate=True,
    # )
    vectorstore = FAISS.from_documents(splitted_texts, embeddings)
    if self.vector_path:
        vectorstore.save_local(self.vector_path)
    log.info(f"✅ Vectorstore built in {datetime.now() - time_start}")
    return vectorstore

def query(
self,
prompt: str,
Expand Down Expand Up @@ -383,30 +310,3 @@ async def aquery(
Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Default mapping of file glob patterns to the langchain loader used for matching files.
# Each entry is passed to DirectoryLoader: "glob" selects the files, "loader_cls" loads them,
# and the optional "loader_kwargs" are forwarded to the loader constructor.
# NOTE: in glob patterns "?" matches exactly one character (it is not "optional"),
# so "*" is used to match extension variants such as .xls/.xlsx or .doc/.docx
DEFAULT_DOCUMENT_LOADERS: List[Dict[str, Union[str, Any]]] = [
    {"glob": "*.pdf", "loader_cls": PyPDFLoader},
    {"glob": "*.csv", "loader_cls": CSVLoader, "loader_kwargs": {"encoding": "utf8"}},
    {
        "glob": "*.tsv",
        "loader_cls": CSVLoader,
        # CSVLoader forwards dialect options to the csv module through csv_args
        "loader_kwargs": {"encoding": "utf8", "csv_args": {"delimiter": "\t"}},
    },
    {
        "glob": "*.psv",
        "loader_cls": CSVLoader,
        # Pipe-separated values: the delimiter must be a single character
        "loader_kwargs": {"encoding": "utf8", "csv_args": {"delimiter": "|"}},
    },
    {"glob": "*.xls*", "loader_cls": UnstructuredExcelLoader},
    {"glob": "*.htm*", "loader_cls": UnstructuredHTMLLoader},
    {"glob": "*.xhtm*", "loader_cls": UnstructuredHTMLLoader},
    {"glob": "*.xml", "loader_cls": UnstructuredHTMLLoader},
    # NOTE(review): JSONLoader appears to require a jq_schema argument — confirm whether loader_kwargs should provide one
    {"glob": "*.json*", "loader_cls": JSONLoader},
    {"glob": "*.md*", "loader_cls": UnstructuredMarkdownLoader},
    {"glob": "*.txt", "loader_cls": TextLoader, "loader_kwargs": {"encoding": "utf8"}},
    {"glob": "*.doc*", "loader_cls": UnstructuredWordDocumentLoader},
    {"glob": "*.odt", "loader_cls": UnstructuredODTLoader},
    {"glob": "*.ppt*", "loader_cls": UnstructuredPowerPointLoader},
    {"glob": "*.epub", "loader_cls": UnstructuredEPubLoader},
    {"glob": "*.eml", "loader_cls": UnstructuredEmailLoader},
    {"glob": "*.enex", "loader_cls": EverNoteLoader},
]
115 changes: 115 additions & 0 deletions src/libre_chat/vectorstore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Module: Build the vectorstore from documents"""
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
CSVLoader,
DirectoryLoader,
EverNoteLoader,
JSONLoader,
PyPDFLoader,
TextLoader,
UnstructuredEmailLoader,
UnstructuredEPubLoader,
UnstructuredExcelLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from libre_chat.conf import ChatConf
from libre_chat.utils import BOLD, CYAN, END, log


def build_vectorstore(
    conf: ChatConf, document_loaders: Any, device: Any, vector_path: Optional[str] = None
) -> Optional[FAISS]:
    """Build a FAISS vectorstore from the files found in the configured documents folder.

    Args:
        conf: Chat configuration; `conf.vector` provides `documents_path`, `embeddings_path`,
            `chunk_size` and `chunk_overlap`.
        document_loaders: Iterable of dicts with a "glob" and a "loader_cls" key, and an optional
            "loader_kwargs" key, each used to create a DirectoryLoader.
        device: Device given to the embeddings model through its `model_kwargs`.
        vector_path: Where the vectorstore is saved; it is not saved when this is empty.

    Returns:
        The vectorstore that was built, or None when there is nothing to index.
    """
    # NOTE: Using Qdrant blocked by UM proxy...
    # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/qdrant.py
    time_start = datetime.now()
    documents_path = conf.vector.documents_path
    # A missing documents folder is handled like an empty one instead of crashing in os.listdir
    has_folder = bool(documents_path) and os.path.isdir(documents_path)
    docs_count = len(os.listdir(documents_path)) if has_folder else 0
    if docs_count < 1:
        log.warning(
            f"⚠️ No documents found in {documents_path}, vectorstore will not be built, and a generic chatbot will be used until documents are added"
        )
        return None

    log.info(
        f"🏗️ Building the vectorstore from the {BOLD}{CYAN}{docs_count}{END} documents found in {BOLD}{documents_path}{END}, using embeddings from {BOLD}{conf.vector.embeddings_path}{END}"
    )
    documents: List[Document] = []
    # Loading all file types provided in the document_loaders object
    for doc_load in document_loaders:
        loader = DirectoryLoader(
            documents_path,
            glob=doc_load["glob"],
            loader_cls=doc_load["loader_cls"],
            loader_kwargs=doc_load.get("loader_kwargs", {}),
        )
        loaded_docs = loader.load()
        if len(loaded_docs) > 0:
            log.info(f"🗃️ Loaded {len(loaded_docs)} items from {doc_load['glob']} files")
            documents.extend(loaded_docs)

    # Split the text up into small, semantically meaningful chunks (often sentences) https://js.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=conf.vector.chunk_size, chunk_overlap=conf.vector.chunk_overlap
    )
    splitted_texts = text_splitter.split_documents(documents)
    if not splitted_texts:
        # Files are present but no loader produced content: FAISS cannot build an index from an empty list
        log.warning(
            f"⚠️ No content could be loaded from the documents found in {documents_path}, vectorstore will not be built"
        )
        return None

    # TODO: use fastembed?
    embeddings = HuggingFaceEmbeddings(
        model_name=conf.vector.embeddings_path, model_kwargs={"device": device}
    )
    # TODO: use Qdrant vectorstore
    # os.makedirs(str(conf.vector.vector_path), exist_ok=True)
    # vectorstore = Qdrant.from_documents(
    #     splitted_texts,
    #     embeddings,
    #     # path=conf.vector.vector_path,
    #     url=conf.vector.vector_path,
    #     collection_name="libre_chat_rag",
    #     prefer_grpc=True,
    #     # force_recreate=True,
    # )
    vectorstore = FAISS.from_documents(splitted_texts, embeddings)
    if vector_path:
        vectorstore.save_local(vector_path)
    log.info(f"✅ Vectorstore built in {datetime.now() - time_start}")
    return vectorstore


# Default mapping of file glob patterns to the langchain loader used for matching files.
# Each entry is passed to DirectoryLoader: "glob" selects the files, "loader_cls" loads them,
# and the optional "loader_kwargs" are forwarded to the loader constructor.
# NOTE: in glob patterns "?" matches exactly one character (it is not "optional"),
# so "*" is used to match extension variants such as .xls/.xlsx or .doc/.docx
DEFAULT_DOCUMENT_LOADERS: List[Dict[str, Union[str, Any]]] = [
    {"glob": "*.pdf", "loader_cls": PyPDFLoader},
    {"glob": "*.csv", "loader_cls": CSVLoader, "loader_kwargs": {"encoding": "utf8"}},
    {
        "glob": "*.tsv",
        "loader_cls": CSVLoader,
        # CSVLoader forwards dialect options to the csv module through csv_args
        "loader_kwargs": {"encoding": "utf8", "csv_args": {"delimiter": "\t"}},
    },
    {
        "glob": "*.psv",
        "loader_cls": CSVLoader,
        # Pipe-separated values: the delimiter must be a single character
        "loader_kwargs": {"encoding": "utf8", "csv_args": {"delimiter": "|"}},
    },
    {"glob": "*.xls*", "loader_cls": UnstructuredExcelLoader},
    {"glob": "*.htm*", "loader_cls": UnstructuredHTMLLoader},
    {"glob": "*.xhtm*", "loader_cls": UnstructuredHTMLLoader},
    {"glob": "*.xml", "loader_cls": UnstructuredHTMLLoader},
    # NOTE(review): JSONLoader appears to require a jq_schema argument — confirm whether loader_kwargs should provide one
    {"glob": "*.json*", "loader_cls": JSONLoader},
    {"glob": "*.md*", "loader_cls": UnstructuredMarkdownLoader},
    {"glob": "*.txt", "loader_cls": TextLoader, "loader_kwargs": {"encoding": "utf8"}},
    {"glob": "*.doc*", "loader_cls": UnstructuredWordDocumentLoader},
    {"glob": "*.odt", "loader_cls": UnstructuredODTLoader},
    {"glob": "*.ppt*", "loader_cls": UnstructuredPowerPointLoader},
    {"glob": "*.epub", "loader_cls": UnstructuredEPubLoader},
    {"glob": "*.eml", "loader_cls": UnstructuredEmailLoader},
    {"glob": "*.enex", "loader_cls": EverNoteLoader},
]

0 comments on commit e55a50f

Please sign in to comment.