Merge pull request #66 from alan-turing-institute/llama2
Implement llama2 + llama_index
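
For context, this change wires a local quantised Llama 2 model into a llama_index retrieval pipeline. A minimal sketch of that wiring, assuming the 0.8-era llama_index API and the model path used in the notebook below (the sample document text is illustrative):

```python
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index import Document, ServiceContext, VectorStoreIndex

# Load a local quantised Llama 2 chat model via llama-cpp-python.
llm = LlamaCPP(
    model_path="../../../llama/llama-2-7b-chat/gguf-model-q4_0.gguf",  # path as in the notebook
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,  # below llama2's 4096-token limit for headroom
    model_kwargs={"n_gpu_layers": 1},  # >= 1 offloads layers to the GPU
    # rewrite chat messages / completions into Llama 2's prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

# Embed documents with a local model (no OpenAI key), build an index, query it.
service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
index = VectorStoreIndex.from_documents(
    [Document(text="REG is the Research Engineering Group at the Turing.")],  # illustrative
    service_context=service_context,
)
print(index.as_query_engine().query("Who are REG?"))
```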
Showing 10 changed files with 9,283 additions and 3,444 deletions.
models/llama-index-hack/llama2_github_collaborators.ipynb: 693 additions & 0 deletions (large diff not rendered by default)
@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms import LlamaCPP\n",
"from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt\n",
"from llama_index import Document\n",
"from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage\n",
"from llama_index import LLMPredictor, PromptHelper, ServiceContext\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"llama.cpp: loading model from ../../../llama/llama-2-7b-chat/ggml-model-q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32000\n",
"llama_model_load_internal: n_ctx = 3900\n",
"llama_model_load_internal: n_embd = 4096\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 32\n",
"llama_model_load_internal: n_head_kv = 32\n",
"llama_model_load_internal: n_layer = 32\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: n_gqa = 1\n",
"llama_model_load_internal: rnorm_eps = 1.0e-06\n",
"llama_model_load_internal: n_ff = 11008\n",
"llama_model_load_internal: freq_base = 10000.0\n",
"llama_model_load_internal: freq_scale = 1\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: model size = 7B\n",
"llama_model_load_internal: ggml ctx size = 0.08 MB\n",
"llama_model_load_internal: mem required = 4160.96 MB (+ 1950.00 MB per state)\n",
"llama_new_context_with_model: kv self size = 1950.00 MB\n",
"AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n"
]
}
],
"source": [
"llm = LlamaCPP(\n",
"    model_path=\"../../../llama/llama-2-7b-chat/gguf-model-q4_0.gguf\",\n",
"    temperature=0.1,\n",
"    max_new_tokens=256,\n",
"    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room\n",
"    context_window=3900,\n",
"    # kwargs to pass to __call__()\n",
"    generate_kwargs={},\n",
"    # kwargs to pass to __init__()\n",
"    # set to at least 1 to use GPU\n",
"    model_kwargs={\"n_gpu_layers\": 1},\n",
"    # transform inputs into Llama2 format\n",
"    messages_to_prompt=messages_to_prompt,\n",
"    completion_to_prompt=completion_to_prompt,\n",
"    verbose=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def load_documents():\n",
"    # each row of the handbook Q&A jsonl becomes one llama_index Document\n",
"    df = pd.read_json(\"../../data/handbook_qa/data/qAndA.jsonl\", lines=True)\n",
"    df[\"prompt\"] = df[\"prompt\"] + \"?\"\n",
"    documents = [Document(text=str(i)) for i in df.values]\n",
"\n",
"    return documents"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"documents = load_documents()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[\\'What are some of the shortcodes in the handbook theme ?\\'\\n \"Some of the shortcodes in the handbook theme include figure, gist, highlight, and KaTeX, as well as custom ones specific to the theme such as hints, expand, and tabs. Shortcodes are templates that can be parametrized and can be included in the content section of a Markdown file to enable more complex features than Markdown\\'s simple syntax allows.\"]'"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"documents[10].text"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def create_service_context(\n",
"    model,\n",
"    max_input_size=1024,\n",
"    num_output=128,\n",
"    chunk_size_lim=256,\n",
"    overlap_ratio=0.1\n",
"    ):\n",
"    # wrap the LLM and configure prompt packing for the context limits above\n",
"    llm_predictor=LLMPredictor(llm=model)\n",
"    prompt_helper=PromptHelper(max_input_size, num_output, overlap_ratio, chunk_size_lim)\n",
"    # embed_model=\"local\" uses a local embedding model instead of OpenAI\n",
"    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=\"local\")\n",
"    return service_context"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/rwood/Library/Caches/pypoetry/virtualenvs/reginald-slack-ZVq5BSHv-py3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"service_context = create_service_context(llm)\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"query_engine = index.as_query_engine()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Based on the provided context information, I understand that REG refers to the Research and Engineering Group (REG) within the Turing Institute.\n",
"According to the context, the REG newsletter is sent out monthly around a week before the monthly team meeting, and the all-REG meeting is also held monthly where new joiners are welcomed and updates from around REG or the Turing are presented, followed by a discussion on a topic of interest for the wider team.\n",
"Therefore, based on the information provided, REG appears to be a group within the Turing Institute that focuses on research and engineering activities, and has regular meetings to share updates and discuss topics of interest.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 5081.84 ms\n",
"llama_print_timings: sample time = 104.11 ms / 148 runs ( 0.70 ms per token, 1421.53 tokens per second)\n",
"llama_print_timings: prompt eval time = 5081.79 ms / 238 tokens ( 21.35 ms per token, 46.83 tokens per second)\n",
"llama_print_timings: eval time = 7902.76 ms / 147 runs ( 53.76 ms per token, 18.60 tokens per second)\n",
"llama_print_timings: total time = 13274.83 ms\n"
]
}
],
"source": [
"response = query_engine.query(\"Who are REG\")\n",
"print(response.response)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['What is the REG newsletter and how often is it sent out?'\n",
" 'The REG newsletter is a monthly email containing short project updates and other news/updates from around the team and the institute. It is sent around a week before the monthly team meeting and comes to the Hut23 mailing list.']\n"
]
}
],
"source": [
"print(response.source_nodes[0].node.text)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Llama.generate: prefix-match hit\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" Thank you for asking! The Alan Turing Institute is based in London, UK.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 5081.84 ms\n",
"llama_print_timings: sample time = 13.43 ms / 19 runs ( 0.71 ms per token, 1415.27 tokens per second)\n",
"llama_print_timings: prompt eval time = 4391.42 ms / 212 tokens ( 20.71 ms per token, 48.28 tokens per second)\n",
"llama_print_timings: eval time = 957.65 ms / 18 runs ( 53.20 ms per token, 18.80 tokens per second)\n",
"llama_print_timings: total time = 5386.18 ms\n"
]
}
],
"source": [
"response = query_engine.query(\"Where is the Alan Turing Institute based?\")\n",
"print(response.response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "llama",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
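
Note that the notebook imports StorageContext and load_index_from_storage but never exercises them. A hedged sketch of how the index built above could be persisted and reloaded with that API (the persist directory name is illustrative):

```python
# Persist the freshly built index to disk so it need not be re-embedded each run.
index.storage_context.persist(persist_dir="./handbook_index")  # illustrative path

# In a later session: reload from disk, passing the same service_context
# so queries keep using the local llama2 LLM and local embeddings.
storage_context = StorageContext.from_defaults(persist_dir="./handbook_index")
index = load_index_from_storage(storage_context, service_context=service_context)
print(index.as_query_engine().query("Who are REG?"))
```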