Add HF embeddings, add custom-tuned prompts, add GPU accel (#45)
* update requirements.txt

* remove state_of_the_union.txt
update gitignore
add multithreading

* add poetry config

* update streamlit version

* update Dockerfile

* update Dockerfile

* fix Dockerfile

* update Dockerfile

* update README.md

* update README.md

* update convert.py & pyproject.toml

* add tokenizer model

* update README & lint

* add pre-commit

* run pre-commit

* fix README.md

* fix (?) convert.py

* fix (?) convert.py

* fix package versions

* clean for merge

* fix README.md

* update README.md for new convert

* redirect to main repo

* fix ingest.py

* rollback README.md

* fix Dockerfile and README.md for streamlit

* fix README.md

* cleaner document handling in ingest.py
remove CI

* add support for ppt, docx

* add sample documents
clean gitignore
bump package version

* load env variables in centralized file
load more variables from env
lint

* remove CI on merge

* check for empty query

* print embedding progress
allow reusing collection
add env USE_MLOCK

* fix model_stop

* fix model_stop

* several minor improvements to startLLM.py
fix empty MODEL_STOP
add CHAIN_TYPE in env

* pre-commit formatting

* Add support for HuggingFace embeddings

* - add custom prompt templates tailored for vic7b-5, which work better than the default ones
- update vic7b to 5.1
- add instructions for GPU support

* update example.env

* update prompts

* fix typo

* fix typo

* update example.env

* re-add strip

* Add N_GPU_LAYERS to .env
fix GPU instructions

---------

Co-authored-by: su77ungr <[email protected]>
hippalectryon-0 and su77ungr authored May 15, 2023
1 parent 2264005 commit e972eac
Showing 8 changed files with 837 additions and 225 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -7,6 +7,8 @@ WORKDIR CASALIOY
RUN pip3 install poetry
RUN python3 -m poetry config virtualenvs.create false
RUN python3 -m poetry install
RUN python3 -m pip install --force streamlit # Temp fix, see pyproject.toml
RUN python3 -m pip install --force streamlit sentence_transformers # Temp fix, see pyproject.toml
RUN python3 -m pip uninstall -y llama-cpp-python
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 -m pip install llama-cpp-python # GPU support
RUN pre-commit install
COPY example.env .env
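To actually exercise the CUBLAS build added above, the container needs GPU passthrough at run time. A minimal sketch, not part of this PR: the `casalioy-gpu` tag, the `/CASALIOY/models` mount target implied by `WORKDIR CASALIOY`, and the NVIDIA Container Toolkit prerequisite are all assumptions.

```shell
# Build the image from this Dockerfile, then run it with GPU access
# (requires the NVIDIA Container Toolkit on the host).
docker build -t casalioy-gpu .
docker run -it --gpus all -v "$(pwd)/models:/CASALIOY/models" casalioy-gpu
```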
14 changes: 10 additions & 4 deletions README.md
@@ -45,7 +45,7 @@ for older docker without GUI use `casalioy:latest` might deprecate soon
```
cd models
wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin &&
wget https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin
wget https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin
cd ../
```

@@ -59,15 +59,21 @@ cd ../
python -m pip install poetry
python -m poetry config virtualenvs.in-project true
python -m poetry install
python -m pip install --force streamlit # Temporary bandaid fix, waiting for streamlit >=1.23
. .venv/bin/activate
python -m pip install --force streamlit sentence_transformers # Temporary bandaid fix, waiting for streamlit >=1.23
pre-commit install
```

If you want GPU support for llama-cpp:
```shell
pip uninstall -y llama-cpp-python
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python
```

> Download the 2 models and place them in a folder called `./models`:
- LLM: default
is [ggjt-v1-vic7b-uncensored-q4_0](https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin)
is [ggml-vic7b-q5_1](https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin)
- Embedding: default
to [ggml-model-q4_0](https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin).

@@ -102,7 +108,7 @@ This should look like this
│ └── shor.pdf
│ └── state_of_the_union.txt
├── models
│ ├── ggjt-v1-vic7b-uncensored-q4_0.bin
│ ├── ggml-vic7b-q5_1.bin
│ └── ggml-model-q4_0.bin
└── .env, convert.py, Dockerfile
```
8 changes: 5 additions & 3 deletions example.env
@@ -1,6 +1,7 @@
# Generic
MODEL_N_CTX=1024
LLAMA_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
TEXT_EMBEDDINGS_MODEL=all-MiniLM-L6-v2
TEXT_EMBEDDINGS_MODEL_TYPE=HF # LlamaCpp or HF
USE_MLOCK=true

# Ingestion
@@ -11,6 +12,7 @@ INGEST_CHUNK_OVERLAP=50

# Generation
MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
MODEL_PATH=models/ggjt-v1-vic7b-uncensored-q4_0.bin
MODEL_PATH=models/ggml-vic7b-q5_1.bin
MODEL_TEMP=0.8
MODEL_STOP=###,\n
MODEL_STOP=[STOP]
CHAIN_TYPE=stuff
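If you would rather keep the previous LlamaCpp embeddings instead of the new HuggingFace default, the same two variables can point back at the old embedding model. A sketch, assuming `ggml-model-q4_0.bin` is still present in `./models`:

```
TEXT_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
TEXT_EMBEDDINGS_MODEL_TYPE=LlamaCpp # LlamaCpp or HF
```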
16 changes: 8 additions & 8 deletions ingest.py
@@ -4,6 +4,7 @@
import sys
from hashlib import md5
from pathlib import Path
from typing import Callable

from langchain.docstore.document import Document
from langchain.document_loaders import (
@@ -15,11 +16,10 @@
UnstructuredHTMLLoader,
UnstructuredPowerPointLoader,
)
from langchain.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models

from load_env import chunk_overlap, chunk_size, documents_directory, llama_embeddings_model, model_n_ctx, persist_directory, use_mlock
from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory

file_loaders = { # extension -> loader
"txt": lambda path: TextLoader(path, encoding="utf8"),
@@ -41,13 +41,13 @@ def load_one_doc(filepath: Path) -> list[Document]:
return file_loaders[filepath.suffix[1:]](str(filepath)).load()


def embed_documents_with_progress(embedding_model: LlamaCppEmbeddings, texts: list[str]) -> list[list[float]]:
def embed_documents_with_progress(embedding_function: Callable, texts: list[str]) -> list[list[float]]:
"""wrapper around embed_documents that prints progress"""
embeddings = []
N_chunks = len(texts)
for i, text in enumerate(texts):
print(f"embedding chunk {i+1}/{N_chunks}")
embeddings.append(embedding_model.client.embed(text))
print(f"embedding chunk {i + 1}/{N_chunks}")
embeddings.append(embedding_function(text))

return [list(map(float, e)) for e in embeddings]

@@ -76,12 +76,12 @@ def main(sources_directory: str, cleandb: str) -> None:

# Generate embeddings
print("Generating embeddings...")
embedding_model = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx, use_mlock=use_mlock)
embeddings = embed_documents_with_progress(embedding_model, texts)
embedding_model, encode_fun = get_embedding_model()
embeddings = embed_documents_with_progress(encode_fun, texts)

# Store embeddings
print("Storing embeddings...")
client = QdrantClient(path=db_dir) # using Qdrant.from_documents recreates the db each time
client = QdrantClient(path=db_dir, prefer_grpc=True) # using Qdrant.from_documents recreates the db each time
try:
collection = client.get_collection("test")
except ValueError: # doesn't exist
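A minimal sketch of the new embedding flow in isolation, useful for checking that the backend selected in `.env` works before ingesting a whole corpus. Running this from the project root with a configured `.env` is an assumption, not something this PR documents:

```python
# Embed two dummy chunks through the progress wrapper, using whichever
# backend .env selects (HF sentence-transformers or LlamaCpp).
from ingest import embed_documents_with_progress
from load_env import get_embedding_model

embedding_model, encode_fun = get_embedding_model()
vectors = embed_documents_with_progress(encode_fun, ["first chunk", "second chunk"])
print(len(vectors), len(vectors[0]))  # chunk count, embedding dimension
```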
68 changes: 65 additions & 3 deletions load_env.py
@@ -1,12 +1,16 @@
"""load env variables"""
import os
from typing import Callable

from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings
from langchain.prompts import PromptTemplate

load_dotenv()

# generic
llama_embeddings_model = os.environ.get("LLAMA_EMBEDDINGS_MODEL")
text_embeddings_model = os.environ.get("TEXT_EMBEDDINGS_MODEL")
text_embeddings_model_type = os.environ.get("TEXT_EMBEDDINGS_MODEL_TYPE")
model_n_ctx = int(os.environ.get("MODEL_N_CTX"))
use_mlock = os.environ.get("USE_MLOCK").lower() == "true"

@@ -19,5 +23,63 @@
# generate
model_type = os.environ.get("MODEL_TYPE")
model_path = os.environ.get("MODEL_PATH")
model_temp = float(os.environ.get("MODEL_TEMP"))
model_stop = os.environ.get("MODEL_STOP").split(",")
model_temp = float(os.environ.get("MODEL_TEMP", "0.8"))
model_stop = os.environ.get("MODEL_STOP", "")
model_stop = model_stop.split(",") if model_stop else []
chain_type = os.environ.get("CHAIN_TYPE", "refine")
n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", 0))


def get_embedding_model() -> tuple[HuggingFaceEmbeddings, Callable] | tuple[LlamaCppEmbeddings, Callable]:
"""get the text embedding model
:returns: tuple[the model, its encoding function]"""
match text_embeddings_model_type:
case "HF":
model = HuggingFaceEmbeddings(model_name=text_embeddings_model)
return model, model.client.encode
case "LlamaCpp":
model = LlamaCppEmbeddings(model_path=text_embeddings_model, n_ctx=model_n_ctx)
return model, model.client.embed
case _:
raise ValueError(f"Unknown embedding type {text_embeddings_model_type}")


def get_prompt_template_kwargs() -> dict[str, PromptTemplate]:
"""get an improved prompt template"""
match chain_type:
case "stuff":
question_prompt = """HUMAN: Answer the question using ONLY the given context. If you are unsure of the answer, respond with "Unknown[STOP]". Conclude your response with "[STOP]" to indicate the completion of the answer.
Context: {context}
Question: {question}
ASSISTANT:"""
return {"prompt": PromptTemplate(template=question_prompt, input_variables=["context", "question"])}
case "refine":
question_prompt = """HUMAN: Answer the question using ONLY the given context.
Indicate the end of your answer with "[STOP]" and refrain from adding any additional information beyond that which is provided in the context.
Question: {question}
Context: {context_str}
ASSISTANT:"""
refine_prompt = """HUMAN: Refine the original answer to the question using the new context.
Use ONLY the information from the context and your previous answer.
If the context is not helpful, use the original answer.
Indicate the end of your answer with "[STOP]" and avoid adding any extraneous information.
Original question: {question}
Existing answer: {existing_answer}
New context: {context_str}
ASSISTANT:"""
return {
"question_prompt": PromptTemplate(template=question_prompt, input_variables=["context_str", "question"]),
"refine_prompt": PromptTemplate(template=refine_prompt, input_variables=["context_str", "existing_answer", "question"]),
}
case _:
return {}
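For context, the dict returned by `get_prompt_template_kwargs()` is shaped to match the keyword arguments of LangChain's QA chain loaders (`prompt` for `stuff`, `question_prompt`/`refine_prompt` for `refine`). Below is a hedged sketch of how it could be consumed; the `load_qa_chain` wiring is an illustration based on the LangChain API of that era, not code from this PR, and assumes `.env` points `MODEL_PATH` at a local ggml model.

```python
# Run one question through a QA chain that uses the custom prompts above.
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.llms import LlamaCpp

from load_env import chain_type, get_prompt_template_kwargs, model_n_ctx, model_path, model_stop, model_temp

llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, temperature=model_temp, stop=model_stop)
chain = load_qa_chain(llm, chain_type=chain_type, **get_prompt_template_kwargs())
docs = [Document(page_content="The sample document states that the sky is blue.")]
print(chain.run(input_documents=docs, question="What color is the sky?"))
```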
