Merge pull request #79 from alan-turing-institute/llama2-query
Fix #78: Llama-2 query engine
rwood-97 authored Sep 14, 2023
2 parents 2b9a5e2 + 389fbbf commit 49fef87
Showing 6 changed files with 209 additions and 51 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@ The Reginald project consists of:
├── docker
│   └── Scripts for building a Docker image
├── models
│   └── REGinald models
│   └── REGinald models (in notebooks)
└── slack_bot
└── Python Slack bot
```
61 changes: 50 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -22,9 +22,9 @@ einops = { version="^0.6.1", optional=true }
faiss-cpu = { version="^1.7.4", optional=true }
gradio = { version="^3.34.0", optional=true }
langchain = "^0.0.278"
llama-index = "^0.8.24"
llama-cpp-python = "^0.1.83"
llama-hub = "^0.0.26"
llama-index = "^0.8.25"
llama-cpp-python = "^0.2.2"
llama-hub = "^0.0.30"
nbconvert = { version="^7.5.0", optional=true }
openai = { version="^0.27.8", optional=true }
pandas = "^2.0.2"
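The pins above move to the llama-index and llama-hub releases used by the new query engine and to the 0.2.x line of llama-cpp-python, which works with the GGUF model file used as the default below. A minimal sketch for confirming which versions actually resolved in an environment (distribution names are taken from the pins above; the printed versions depend on the lockfile):

```python
# Sketch: report the resolved versions of the bumped dependencies.
# Distribution names come from pyproject.toml; installed versions will vary.
import importlib.metadata as metadata

for dist in ("llama-index", "llama-cpp-python", "llama-hub"):
    print(f"{dist}: {metadata.version(dist)}")
```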
113 changes: 87 additions & 26 deletions slack_bot/run.py
@@ -10,54 +10,96 @@

from slack_bot import MODELS, Bot

DEFAULT_LLAMA_CPP_GGUF_MODEL = (
"https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve"
"/main/llama-2-13b-chat.Q6_K.gguf"
)
DEFAULT_HF_MODEL = "StabilityAI/stablelm-tuned-alpha-3b"


if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
"--model", "-m", help="Select which model to use", default=None, choices=MODELS
)
parser.add_argument(
"--hf_model",
"-hf",
help="""Select which HuggingFace model to use
(ignored if not using llama-huggingface model)""",
default="StabilityAI/stablelm-tuned-alpha-3b",
"--model-name",
"-n",
type=str,
help=(
"Select which LlamaCPP or HuggingFace model to use "
"(ignored if not using llama-index-llama-cpp or llama-index-hf). "
"Default model for llama-index-llama-cpp is downloaded from "
f"{DEFAULT_LLAMA_CPP_GGUF_MODEL}. "
"Default model for llama-index-hf is downloaded from "
f"{DEFAULT_HF_MODEL}."
),
default=None,
)
parser.add_argument(
"--path",
"-p",
help=(
"Whether or not the model_name passed is a path to the model "
"(ignored if not using llama-index-llama-cpp)"
),
action="store_true",
)
parser.add_argument(
"--max_input_size",
"--max-input-size",
"-max",
help="""Select maximum input size for HuggingFace model
(ignored if not using llama-huggingface model)""",
type=int,
help=(
"Select maximum input size for LlamaCPP or HuggingFace model "
"(ignored if not using llama-index-llama-cpp or llama-index-hf)"
),
default=4096,
)
parser.add_argument(
"--n-gpu-layers",
"-ngl",
type=int,
help=(
"Select number of GPU layers for LlamaCPP model "
"(ignored if not using llama-index-llama-cpp)"
),
default=0,
)
parser.add_argument(
"--device",
"-dev",
help="""Select device for HuggingFace model
(ignored if not using llama-huggingface model)""",
type=str,
help=(
"Select device for HuggingFace model "
"(ignored if not using llama-index-hf model)"
),
default="auto",
)
parser.add_argument(
"--force-new-index",
"-f",
help="Recreate the index vector store or not",
action=argparse.BooleanOptionalAction,
default=False,
action="store_true",
)
parser.add_argument(
"--data-dir",
"-d",
type=pathlib.Path,
help="Location for data",
default=(pathlib.Path(__file__).parent.parent / "data").resolve(),
)
parser.add_argument(
"--which-index",
"-w",
help="""Specifies the directory name for looking up/writing indices.
Currently supports 'all_data', 'public' and 'handbook'.
If regenerating index, 'all_data' will use all .txt .md. and .csv
files in the data directory, 'handbook' will
only use 'handbook.csv' file.""",
type=str,
help=(
"Specifies the directory name for looking up/writing indices. "
"Currently supports 'all_data', 'public' and 'handbook'. "
"If regenerating index, 'all_data' will use all .txt .md. and .csv "
"files in the data directory, 'handbook' will "
"only use 'handbook.csv' file."
),
default="all_data",
choices=["all_data", "public", "handbook"],
)
@@ -107,24 +149,43 @@
logging.error(f"Model {model_name} was not recognised")
sys.exit(1)

# Initialise LLM response model
logging.info(f"Initialising bot with model: {model_name}")

if model_name == "llama-index-hf":
response_model = model(
model_name=args.hf_model,
max_input_size=args.max_input_size,
device=args.device,
force_new_index=force_new_index,
data_dir=data_dir,
which_index=which_index,
)
# Set up any model args that are required
if model_name == "llama-index-llama-cpp":
if args.model_name is None:
args.model_name = DEFAULT_LLAMA_CPP_GGUF_MODEL

model_args = {
"model_name": args.model_name,
"path": args.path,
"n_gpu_layers": args.n_gpu_layers,
"max_input_size": args.max_input_size,
}
elif model_name == "llama-index-hf":
if args.model_name is None:
args.model_name = DEFAULT_HF_MODEL

model_args = {
"model_name": args.model_name,
"device": args.device,
"max_input_size": args.max_input_size,
}
else:
model_args = {}

if model_name == "hello":
response_model = model()
else:
response_model = model(
force_new_index=force_new_index,
data_dir=data_dir,
which_index=which_index,
**model_args,
)

# Initialise Bot with response model
logging.info(f"Initalising bot with model: {response_model}")

slack_bot = Bot(response_model)
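For reference, a hedged sketch of how the new flags might be combined to launch the bot with the LlamaCPP backend. Flag names are taken from the argparse setup above; the GGUF path and GPU layer count are placeholders:

```python
# Sketch: launch run.py with the new llama-index-llama-cpp options.
# The model path below is a placeholder; omit --model-name and --path to
# download the default GGUF model from DEFAULT_LLAMA_CPP_GGUF_MODEL.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "slack_bot/run.py",
        "--model", "llama-index-llama-cpp",
        "--model-name", "/path/to/llama-2-13b-chat.Q6_K.gguf",
        "--path",                  # model-name is a local file, not a URL
        "--n-gpu-layers", "0",     # 0 = CPU only; -1 offloads all layers
        "--max-input-size", "4096",
        "--which-index", "handbook",
        "--force-new-index",
    ],
    check=True,
)
```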
8 changes: 7 additions & 1 deletion slack_bot/slack_bot/models/__init__.py
@@ -1,14 +1,20 @@
from .base import ResponseModel
from .chat_completion import ChatCompletionAzure, ChatCompletionOpenAI
from .hello import Hello
from .llama_index import LlamaIndexGPTAzure, LlamaIndexGPTOpenAI, LlamaIndexHF
from .llama_index import (
LlamaIndexGPTAzure,
LlamaIndexGPTOpenAI,
LlamaIndexHF,
LlamaIndexLlamaCPP,
)

# Please ensure that any models needing OPENAI_API_KEY are named *openai*
# Please ensure that any models needing OPENAI_AZURE_API_BASE and OPENAI_AZURE_API_KEY are named *azure*
MODELS = {
"chat-completion-azure": ChatCompletionAzure,
"chat-completion-openai": ChatCompletionOpenAI,
"hello": Hello,
"llama-index-llama-cpp": LlamaIndexLlamaCPP,
"llama-index-hf": LlamaIndexHF,
"llama-index-gpt-azure": LlamaIndexGPTAzure,
"llama-index-gpt-openai": LlamaIndexGPTOpenAI,
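A small sketch of how the extended registry resolves a CLI model name to its class (assuming the slack_bot package is importable, as in run.py):

```python
# Sketch: look up the new LlamaCPP entry in the MODELS registry.
from slack_bot import MODELS

model_cls = MODELS["llama-index-llama-cpp"]
print(model_cls.__name__)  # -> "LlamaIndexLlamaCPP"
```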
70 changes: 61 additions & 9 deletions slack_bot/slack_bot/models/llama_index.py
@@ -17,8 +17,9 @@
load_index_from_storage,
)
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.llms import AzureOpenAI, HuggingFaceLLM, OpenAI
from llama_index.llms import AzureOpenAI, HuggingFaceLLM, LlamaCPP, OpenAI
from llama_index.llms.base import LLM
from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt
from llama_index.prompts import PromptTemplate
from llama_index.response.schema import RESPONSE_TYPE

@@ -36,7 +37,6 @@ def __init__(
max_input_size: int,
data_dir: pathlib.Path,
which_index: str,
device: str | None = None,
chunk_size: Optional[int] = None,
k: int = 3,
chunk_overlap_ratio: float = 0.1,
@@ -59,9 +59,6 @@
which_index : str
Which index to construct (if force_new_index is True) or use.
Options are "handbook", "public", or "all_data".
device : str, optional
Device to use for the LLM, by default None.
This is ignored if the LLM is model from OpenAI or Azure.
chunk_size : Optional[int], optional
Maximum size of chunks to use, by default None.
If None, this is computed as `ceil(max_input_size / k)`.
@@ -80,7 +77,6 @@
self.max_input_size = max_input_size
self.model_name = model_name
self.num_output = num_output
self.device = device
if chunk_size is None:
chunk_size = math.ceil(max_input_size / k)
self.chunk_size = chunk_size
@@ -332,10 +328,64 @@ def channel_mention(self, message: str, user_id: str) -> MessageResponse:
return MessageResponse(backend_response)


class LlamaIndexLlamaCPP(LlamaIndex):
def __init__(
self,
model_name: str,
path: bool,
n_gpu_layers: int = 0,
*args: Any,
**kwargs: Any,
) -> None:
"""
`LlamaIndexLlamaCPP` is a subclass of `LlamaIndex` that uses
llama-cpp to implement the LLM.
Parameters
----------
model_name : str
Either the path to the model or the URL to download the model from
path : bool
If True, model_name is used as a path to the model file,
otherwise it should be the URL to download the model
n_gpu_layers : int, optional
Number of layers to offload to GPU.
If -1, all layers are offloaded, by default 0
"""
self.path = path
self.n_gpu_layers = n_gpu_layers
super().__init__(*args, model_name=model_name, **kwargs)

def _prep_llm(self) -> LLM:
logging.info(
f"Setting up LlamaCPP LLM (model {self.model_name}) on {self.n_gpu_layers} GPU layers"
)
logging.info(
f"LlamaCPP-args: (context_window: {self.max_input_size}, num_output: {self.num_output})"
)

return LlamaCPP(
model_url=self.model_name if not self.path else None,
model_path=self.model_name if self.path else None,
temperature=0.1,
max_new_tokens=self.num_output,
context_window=self.max_input_size,
# kwargs to pass to __call__()
generate_kwargs={},
# kwargs to pass to __init__()
model_kwargs={"n_gpu_layers": self.n_gpu_layers},
# transform inputs into Llama2 format
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
verbose=True,
)


class LlamaIndexHF(LlamaIndex):
def __init__(
self,
model_name: str = "StabilityAI/stablelm-tuned-alpha-3b",
device: str = "auto",
*args: Any,
**kwargs: Any,
) -> None:
@@ -348,13 +398,15 @@ def __init__(
model_name : str, optional
Model name from Huggingface's model hub,
by default "StabilityAI/stablelm-tuned-alpha-3b".
device : str, optional
Device map to use for the LLM, by default "auto".
"""
self.device = device
super().__init__(*args, model_name=model_name, **kwargs)

def _prep_llm(self) -> LLM:
dev = self.device or "auto"
logging.info(
f"Setting up Huggingface LLM (model {self.model_name}) on device {dev}"
f"Setting up Huggingface LLM (model {self.model_name}) on device {self.device}"
)
logging.info(
f"HF-args: (context_window: {self.max_input_size}, num_output: {self.num_output})"
@@ -365,7 +417,7 @@ def _prep_llm(self) -> LLM:
max_new_tokens=self.num_output,
# TODO: allow user to specify the query wrapper prompt for their model
query_wrapper_prompt=PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>"),
generate_kwargs={"temperature": 0.25, "do_sample": False},
generate_kwargs={"temperature": 0.1, "do_sample": False},
tokenizer_name=self.model_name,
model_name=self.model_name,
device_map=self.device or "auto",
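To illustrate the new class in isolation, a hedged sketch of constructing it with the same keyword arguments that run.py assembles. The data directory and index name are placeholders, and downloading the model plus building the index can be slow on first run:

```python
# Sketch: instantiate the LlamaCPP-backed model directly, mirroring the
# model_args and shared kwargs built in run.py. Paths and the index name
# are assumptions for illustration.
import pathlib

from slack_bot.models import LlamaIndexLlamaCPP

response_model = LlamaIndexLlamaCPP(
    model_name=(
        "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve"
        "/main/llama-2-13b-chat.Q6_K.gguf"
    ),
    path=False,            # model_name is a URL, so LlamaCPP downloads it
    n_gpu_layers=0,        # CPU only; -1 offloads every layer to the GPU
    max_input_size=4096,
    force_new_index=False,
    data_dir=pathlib.Path("data"),
    which_index="handbook",
)
```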
