Merge pull request #79 from alan-turing-institute/llama2-query
Fix #78: Llama-2 query engine
rwood-97 authored Sep 14, 2023
2 parents 2b9a5e2 + 389fbbf commit 49fef87
Showing 6 changed files with 209 additions and 51 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@ The Reginald project consists of:
├── docker
│   └── Scripts for building a Docker image
├── models
│   └── REGinald models
│   └── REGinald models (in notebooks)
└── slack_bot
└── Python Slack bot
```
61 changes: 50 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -22,9 +22,9 @@ einops = { version="^0.6.1", optional=true }
faiss-cpu = { version="^1.7.4", optional=true }
gradio = { version="^3.34.0", optional=true }
langchain = "^0.0.278"
llama-index = "^0.8.24"
llama-cpp-python = "^0.1.83"
llama-hub = "^0.0.26"
llama-index = "^0.8.25"
llama-cpp-python = "^0.2.2"
llama-hub = "^0.0.30"
nbconvert = { version="^7.5.0", optional=true }
openai = { version="^0.27.8", optional=true }
pandas = "^2.0.2"
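The pins above move to the llama-index and llama-hub releases used by the new query engine and to the 0.2.x line of llama-cpp-python, which works with the GGUF model file used as the default below. A minimal sketch for confirming which versions actually resolved in an environment (distribution names are taken from the pins above; the printed versions depend on the lockfile):

```python
# Sketch: report the resolved versions of the bumped dependencies.
# Distribution names come from pyproject.toml; installed versions will vary.
import importlib.metadata as metadata

for dist in ("llama-index", "llama-cpp-python", "llama-hub"):
    print(f"{dist}: {metadata.version(dist)}")
```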
113 changes: 87 additions & 26 deletions slack_bot/run.py
@@ -10,54 +10,96 @@

from slack_bot import MODELS, Bot

DEFAULT_LLAMA_CPP_GGUF_MODEL = (
"https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve"
"/main/llama-2-13b-chat.Q6_K.gguf"
)
DEFAULT_HF_MODEL = "StabilityAI/stablelm-tuned-alpha-3b"


if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
"--model", "-m", help="Select which model to use", default=None, choices=MODELS
)
parser.add_argument(
"--hf_model",
"-hf",
help="""Select which HuggingFace model to use
(ignored if not using llama-huggingface model)""",
default="StabilityAI/stablelm-tuned-alpha-3b",
"--model-name",
"-n",
type=str,
help=(
"Select which LlamaCPP or HuggingFace model to use "
"(ignored if not using llama-index-llama-cpp or llama-index-hf). "
"Default model for llama-index-llama-cpp is downloaded from "
f"{DEFAULT_LLAMA_CPP_GGUF_MODEL}. "
"Default model for llama-index-hf is downloaded from "
f"{DEFAULT_HF_MODEL}."
),
default=None,
)
parser.add_argument(
"--path",
"-p",
help=(
"Whether or not the model_name passed is a path to the model "
"(ignored if not using llama-index-llama-cpp)"
),
action="store_true",
)
parser.add_argument(
"--max_input_size",
"--max-input-size",
"-max",
help="""Select maximum input size for HuggingFace model
(ignored if not using llama-huggingface model)""",
type=int,
help=(
"Select maximum input size for LlamaCPP or HuggingFace model "
"(ignored if not using llama-index-llama-cpp or llama-index-hf)"
),
default=4096,
)
parser.add_argument(
"--n-gpu-layers",
"-ngl",
type=int,
help=(
"Select number of GPU layers for LlamaCPP model "
"(ignored if not using llama-index-llama-cpp)"
),
default=0,
)
parser.add_argument(
"--device",
"-dev",
help="""Select device for HuggingFace model
(ignored if not using llama-huggingface model)""",
type=str,
help=(
"Select device for HuggingFace model "
"(ignored if not using llama-index-hf model)"
),
default="auto",
)
parser.add_argument(
"--force-new-index",
"-f",
help="Recreate the index vector store or not",
action=argparse.BooleanOptionalAction,
default=False,
action="store_true",
)
parser.add_argument(
"--data-dir",
"-d",
type=pathlib.Path,
help="Location for data",
default=(pathlib.Path(__file__).parent.parent / "data").resolve(),
)
parser.add_argument(
"--which-index",
"-w",
help="""Specifies the directory name for looking up/writing indices.
Currently supports 'all_data', 'public' and 'handbook'.
If regenerating index, 'all_data' will use all .txt .md. and .csv
files in the data directory, 'handbook' will
only use 'handbook.csv' file.""",
type=str,
help=(
"Specifies the directory name for looking up/writing indices. "
"Currently supports 'all_data', 'public' and 'handbook'. "
"If regenerating index, 'all_data' will use all .txt .md. and .csv "
"files in the data directory, 'handbook' will "
"only use 'handbook.csv' file."
),
default="all_data",
choices=["all_data", "public", "handbook"],
)
@@ -107,24 +149,43 @@
logging.error(f"Model {model_name} was not recognised")
sys.exit(1)

# Initialise LLM response model
logging.info(f"Initialising bot with model: {model_name}")

if model_name == "llama-index-hf":
response_model = model(
model_name=args.hf_model,
max_input_size=args.max_input_size,
device=args.device,
force_new_index=force_new_index,
data_dir=data_dir,
which_index=which_index,
)
# Set up any model args that are required
if model_name == "llama-index-llama-cpp":
if args.model_name is None:
args.model_name = DEFAULT_LLAMA_CPP_GGUF_MODEL

model_args = {
"model_name": args.model_name,
"path": args.path,
"n_gpu_layers": args.n_gpu_layers,
"max_input_size": args.max_input_size,
}
elif model_name == "llama-index-hf":
if args.model_name is None:
args.model_name = DEFAULT_HF_MODEL

model_args = {
"model_name": args.model_name,
"device": args.device,
"max_input_size": args.max_input_size,
}
else:
model_args = {}

if model_name == "hello":
response_model = model()
else:
response_model = model(
force_new_index=force_new_index,
data_dir=data_dir,
which_index=which_index,
**model_args,
)

# Initialise Bot with response model
logging.info(f"Initalising bot with model: {response_model}")

slack_bot = Bot(response_model)
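For reference, a hedged sketch of how the new flags might be combined to launch the bot with the LlamaCPP backend. Flag names are taken from the argparse setup above; the GGUF path and GPU layer count are placeholders:

```python
# Sketch: launch run.py with the new llama-index-llama-cpp options.
# The model path below is a placeholder; omit --model-name and --path to
# download the default GGUF model from DEFAULT_LLAMA_CPP_GGUF_MODEL.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "slack_bot/run.py",
        "--model", "llama-index-llama-cpp",
        "--model-name", "/path/to/llama-2-13b-chat.Q6_K.gguf",
        "--path",                  # model-name is a local file, not a URL
        "--n-gpu-layers", "0",     # 0 = CPU only; -1 offloads all layers
        "--max-input-size", "4096",
        "--which-index", "handbook",
        "--force-new-index",
    ],
    check=True,
)
```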
8 changes: 7 additions & 1 deletion slack_bot/slack_bot/models/__init__.py
@@ -1,14 +1,20 @@
from .base import ResponseModel
from .chat_completion import ChatCompletionAzure, ChatCompletionOpenAI
from .hello import Hello
from .llama_index import LlamaIndexGPTAzure, LlamaIndexGPTOpenAI, LlamaIndexHF
from .llama_index import (
LlamaIndexGPTAzure,
LlamaIndexGPTOpenAI,
LlamaIndexHF,
LlamaIndexLlamaCPP,
)

# Please ensure that any models needing OPENAI_API_KEY are named *openai*
# Please ensure that any models needing OPENAI_AZURE_API_BASE and OPENAI_AZURE_API_KEY are named *azure*
MODELS = {
"chat-completion-azure": ChatCompletionAzure,
"chat-completion-openai": ChatCompletionOpenAI,
"hello": Hello,
"llama-index-llama-cpp": LlamaIndexLlamaCPP,
"llama-index-hf": LlamaIndexHF,
"llama-index-gpt-azure": LlamaIndexGPTAzure,
"llama-index-gpt-openai": LlamaIndexGPTOpenAI,
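A small sketch of how the extended registry resolves a CLI model name to its class (assuming the slack_bot package is importable, as in run.py):

```python
# Sketch: look up the new LlamaCPP entry in the MODELS registry.
from slack_bot import MODELS

model_cls = MODELS["llama-index-llama-cpp"]
print(model_cls.__name__)  # -> "LlamaIndexLlamaCPP"
```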
70 changes: 61 additions & 9 deletions slack_bot/slack_bot/models/llama_index.py
@@ -17,8 +17,9 @@
load_index_from_storage,
)
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.llms import AzureOpenAI, HuggingFaceLLM, OpenAI
from llama_index.llms import AzureOpenAI, HuggingFaceLLM, LlamaCPP, OpenAI
from llama_index.llms.base import LLM
from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt
from llama_index.prompts import PromptTemplate
from llama_index.response.schema import RESPONSE_TYPE

@@ -36,7 +37,6 @@ def __init__(
max_input_size: int,
data_dir: pathlib.Path,
which_index: str,
device: str | None = None,
chunk_size: Optional[int] = None,
k: int = 3,
chunk_overlap_ratio: float = 0.1,
@@ -59,9 +59,6 @@
which_index : str
Which index to construct (if force_new_index is True) or use.
Options are "handbook", "public", or "all_data".
device : str, optional
Device to use for the LLM, by default None.
This is ignored if the LLM is model from OpenAI or Azure.
chunk_size : Optional[int], optional
Maximum size of chunks to use, by default None.
If None, this is computed as `ceil(max_input_size / k)`.
@@ -80,7 +77,6 @@
self.max_input_size = max_input_size
self.model_name = model_name
self.num_output = num_output
self.device = device
if chunk_size is None:
chunk_size = math.ceil(max_input_size / k)
self.chunk_size = chunk_size
@@ -332,10 +328,64 @@ def channel_mention(self, message: str, user_id: str) -> MessageResponse:
return MessageResponse(backend_response)


class LlamaIndexLlamaCPP(LlamaIndex):
def __init__(
self,
model_name: str,
path: bool,
n_gpu_layers: int = 0,
*args: Any,
**kwargs: Any,
) -> None:
"""
`LlamaIndexLlamaCPP` is a subclass of `LlamaIndex` that uses
llama-cpp to implement the LLM.
Parameters
----------
model_name : str
Either the path to the model or the URL to download the model from
path : bool
If True, model_name is used as a path to the model file,
otherwise it should be the URL to download the model
n_gpu_layers : int, optional
Number of layers to offload to GPU.
If -1, all layers are offloaded, by default 0
"""
self.path = path
self.n_gpu_layers = n_gpu_layers
super().__init__(*args, model_name=model_name, **kwargs)

def _prep_llm(self) -> LLM:
logging.info(
f"Setting up LlamaCPP LLM (model {self.model_name}) on {self.n_gpu_layers} GPU layers"
)
logging.info(
f"LlamaCPP-args: (context_window: {self.max_input_size}, num_output: {self.num_output})"
)

return LlamaCPP(
model_url=self.model_name if not self.path else None,
model_path=self.model_name if self.path else None,
temperature=0.1,
max_new_tokens=self.num_output,
context_window=self.max_input_size,
# kwargs to pass to __call__()
generate_kwargs={},
# kwargs to pass to __init__()
model_kwargs={"n_gpu_layers": self.n_gpu_layers},
# transform inputs into Llama2 format
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
verbose=True,
)


class LlamaIndexHF(LlamaIndex):
def __init__(
self,
model_name: str = "StabilityAI/stablelm-tuned-alpha-3b",
device: str = "auto",
*args: Any,
**kwargs: Any,
) -> None:
@@ -348,13 +398,15 @@ def __init__(
model_name : str, optional
Model name from Huggingface's model hub,
by default "StabilityAI/stablelm-tuned-alpha-3b".
device : str, optional
Device map to use for the LLM, by default "auto".
"""
self.device = device
super().__init__(*args, model_name=model_name, **kwargs)

def _prep_llm(self) -> LLM:
dev = self.device or "auto"
logging.info(
f"Setting up Huggingface LLM (model {self.model_name}) on device {dev}"
f"Setting up Huggingface LLM (model {self.model_name}) on device {self.device}"
)
logging.info(
f"HF-args: (context_window: {self.max_input_size}, num_output: {self.num_output})"
@@ -365,7 +417,7 @@ def _prep_llm(self) -> LLM:
max_new_tokens=self.num_output,
# TODO: allow user to specify the query wrapper prompt for their model
query_wrapper_prompt=PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>"),
generate_kwargs={"temperature": 0.25, "do_sample": False},
generate_kwargs={"temperature": 0.1, "do_sample": False},
tokenizer_name=self.model_name,
model_name=self.model_name,
device_map=self.device or "auto",
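To illustrate the new class in isolation, a hedged sketch of constructing it with the same keyword arguments that run.py assembles. The data directory and index name are placeholders, and downloading the model plus building the index can be slow on first run:

```python
# Sketch: instantiate the LlamaCPP-backed model directly, mirroring the
# model_args and shared kwargs built in run.py. Paths and the index name
# are assumptions for illustration.
import pathlib

from slack_bot.models import LlamaIndexLlamaCPP

response_model = LlamaIndexLlamaCPP(
    model_name=(
        "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve"
        "/main/llama-2-13b-chat.Q6_K.gguf"
    ),
    path=False,            # model_name is a URL, so LlamaCPP downloads it
    n_gpu_layers=0,        # CPU only; -1 offloads every layer to the GPU
    max_input_size=4096,
    force_new_index=False,
    data_dir=pathlib.Path("data"),
    which_index="handbook",
)
```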
