[MIN-103] Deploy OpenLLM on AKS #66

Open · wants to merge 17 commits into develop
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
files: "\\.(py|txt|yaml|json|md|toml|lock|cfg|html|sh|js|yml)$"
- id: end-of-file-fixer
- id: check-added-large-files
args: ['--maxkb=1000']
args: ["--maxkb=1000"]
- id: check-case-conflict
- id: requirements-txt-fixer

@@ -44,3 +44,4 @@ repos:
hooks:
- id: typos
args: [--config=_typos.toml]
pass_filenames: false
82 changes: 82 additions & 0 deletions README.md
@@ -230,6 +230,7 @@ curl localhost:5000/query_readability
```

### Monitoring MindGPT 👀

We've created a [notebook](notebook/monitoring_notebook.ipynb) which accesses the monitoring service, fetches the metrics, and creates some simple plots showing the change over time.

This is a starting point for accessing the metrics, and we're planning to introduce a hosted dashboard version of these plots at some point in the future.
@@ -279,6 +280,87 @@ kubectl get service streamlit-service -o jsonpath='{.status.loadBalancer.ingress

If you visit that URL in your browser, you should be able to interact with the deployed Streamlit application.

# Larger LLMs

In this section, we'll show how to deploy a larger LLM on AKS. We will use the [OpenLLM](https://github.com/bentoml/OpenLLM) project to deploy a `flan-t5-xl` model. The `xl` variant contains 3B parameters and weighs about 11 GB.

### Deploying the model on AKS

For this approach, we will deploy two VMs: one will act as the CPU pool and the other as the GPU pool. The GPU pool will run the model.

> Note: The `Standard_NC4as_T4_v3` VM instance is not readily available; a quota request has to be submitted to Azure. You can check your current quota as shown below.
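One way to check your current vCPU quota for the T4 family is with the Azure CLI; `<region>` is a placeholder for the region you intend to deploy to, and the `grep` filter is a heuristic for the NC-series T4 family row:

```bash
# List vCPU usage and limits for the region; look for the NCASv3_T4 family
az vm list-usage --location <region> --output table | grep -i "T4"
```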

1. Create a new AKS cluster with the system node CPU pool using a `Standard_DS2_v3` VM instance.

```bash
az aks create --resource-group <existing-resource-group> --name largellm --node-count 1 --node-vm-size Standard_DS2_v3 --generate-ssh-keys
```

2. Create a GPU node pool using the `Standard_NC4as_T4_v3` VM instance. This VM instance has 4 vCPUs, 28 GB of RAM, and a T4 GPU with 16 GB of VRAM.

```bash
az extension add --name aks-preview
az extension update --name aks-preview
az feature register --namespace "Microsoft.ContainerService" --name "GPUDedicatedVHDPreview"
az provider register --namespace Microsoft.ContainerService
```
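Feature registration can take several minutes. Before adding the node pool, you can poll the registration state until it reports `Registered`:

```bash
az feature show --namespace "Microsoft.ContainerService" --name "GPUDedicatedVHDPreview" --query properties.state -o tsv
```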

```bash
az aks nodepool add \
--resource-group <existing-resource-group> \
--cluster-name largellm \
--name gpunp \
--node-count 1 \
--node-vm-size Standard_NC4as_T4_v3 \
--node-taints sku=gpu:NoSchedule \
--aks-custom-headers UseGPUDedicatedVHD=true \
--enable-cluster-autoscaler \
--min-count 1 \
--max-count 1
```

Verify that both nodes are provisioned.

```bash
kubectl get nodes

Expected output (the node names might differ)
NAME STATUS ROLES AGE VERSION
aks-gpunp-42873702-vmss000000 Ready agent 5h5m v1.26.6
aks-nodepool1-25311124-vmss000000 Ready agent 5h9m v1.26.6
```
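You can also confirm that the T4 is exposed to Kubernetes by checking the GPU node's allocatable resources (the node name below is the one from the example output; replace it with yours):

```bash
# Should print "1" once the GPU device plugin is up
kubectl get node aks-gpunp-42873702-vmss000000 -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
```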

3. Build and push the Docker image to ACR.
> Note: This creates a Docker image of **12-13 GB** that will be pushed to the ACR.

```bash
az acr login --name <acr-registry-name>
docker build -t $acr_registry_uri/mindgpt/openllm:latest -f infrastructure/llm_k8s/Dockerfile .
docker push $acr_registry_uri/mindgpt/openllm:latest
```
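To double-check that the push succeeded, you can list the tags stored in the registry:

```bash
az acr repository show-tags --name <acr-registry-name> --repository mindgpt/openllm --output table
```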

4. Allow AKS to pull the image from ACR.

```bash
az aks update --resource-group <existing-resource-group> --name largellm --attach-acr <acr-registry-name>
```
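Optionally, validate that the cluster can now pull from the registry:

```bash
az aks check-acr --resource-group <existing-resource-group> --name largellm --acr <acr-registry-name>.azurecr.io
```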

5. Apply the Kubernetes manifest to deploy the model on AKS.

```bash
kubectl apply -f infrastructure/llm_k8s/openllm-deployment.yaml
```
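The first rollout can take a while because the node has to pull the 12-13 GB image. You can watch its progress with:

```bash
kubectl rollout status deployment/openllm-mindgpt-deployment
```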

6. Verify that the model is deployed.

```bash
kubectl get pods

Expected output (the pod name might differ)
NAME READY STATUS RESTARTS AGE
openllm-mindgpt-deployment-77985f86c9-4fj8b 1/1 Running 0 137m
```
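As a quick smoke test, you can port-forward the service and query the generation endpoint directly. The payload shape mirrors what the Streamlit app sends (see `_create_payload` in `app/app_utils/llm.py`); the prompt below is just an example:

```bash
kubectl port-forward service/openllm-mindgpt-svc 3000:3000

# In a separate terminal:
curl -s -X POST http://localhost:3000/v1/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "What is anxiety?", "llm_config": {"temperature": 0.7, "max_new_tokens": 128}}'
```

The response is a JSON object whose `responses` list holds the generated text, which the app then cleans with `clean_fastchat_t5_output`.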

# 🤝 Acknowledgements

This project wouldn't be possible without the exceptional content on both the Mind and NHS Mental Health websites.
69 changes: 34 additions & 35 deletions app/app_utils/llm.py
@@ -1,15 +1,16 @@
"""Utility functions for interacting with the deployed LLM."""
import json
import logging
from typing import Any, Dict, List, TypedDict
import re
from typing import Dict, List, TypedDict, Union

import requests
import streamlit as st
from configs.prompt_template import DEFAULT_CONTEXT, PROMPT_TEMPLATES
from configs.service_config import (
SELDON_NAMESPACE,
SELDON_PORT,
SELDON_SERVICE_NAME,
OPENLLM_NAMESPACE,
OPENLLM_PORT,
OPENLLM_SERVICE_NAME,
)


@@ -35,7 +36,9 @@ def get_prediction_endpoint() -> str:
Returns:
str: the url endpoint if it exists and is valid, None otherwise.
"""
return f"http://{SELDON_SERVICE_NAME}.{SELDON_NAMESPACE}:{SELDON_PORT}/v2/models/transformer/infer"
return (
f"http://{OPENLLM_SERVICE_NAME}.{OPENLLM_NAMESPACE}:{OPENLLM_PORT}/v1/generate"
)


def _build_conversation_history_template(history_list: List[Dict[str, str]]) -> str:
@@ -70,11 +73,27 @@ def build_memory_dict(question: str, response: str) -> Dict[str, str]:
return {"user_input": question, "ai_response": response}


def clean_fastchat_t5_output(answer: str) -> str:
"""Clean the output from the fastchat-t5 model.

Args:
answer (str): Output response from model

Returns:
str: Cleaned response.
"""
# Remove <pad> tags, collapse repeated spaces, and strip the trailing newline
answer = re.sub(r"<pad>\s+", "", answer)
answer = re.sub(r" {2,}", " ", answer)
answer = re.sub(r"\n$", "", answer)
return answer


def _create_payload(
messages: MessagesType,
temperature: float,
max_length: int,
) -> Dict[str, List[Dict[str, Any]]]:
) -> Dict[str, Union[str, Dict[str, float]]]:
"""Create a payload from the user input to send to the LLM model.

Args:
@@ -83,7 +102,7 @@ def _create_payload(
max_length (int): max response length in tokens

Returns:
Dict[str, List[Dict[str, Any]]]: the payload to send in the correct format.
Dict[str, Union[str, Dict[str, float]]]: the payload to send in the correct format.
"""
context = messages.get("context", DEFAULT_CONTEXT)
history: List[Dict[str, str]] = messages.get("history", [])
@@ -106,39 +125,19 @@ def _create_payload(
logging.info(f"Prompt to LLM : {input_text}")

return {
"inputs": [
{
"name": "array_inputs",
"shape": [-1],
"datatype": "string",
"data": str(input_text),
},
{
"name": "max_length",
"shape": [-1],
"datatype": "INT32",
"data": [max_length],
"parameters": {"content_type": "raw"},
},
{
"name": "temperature",
"shape": [-1],
"datatype": "INT32",
"data": [temperature],
"parameters": {"content_type": "raw"},
},
]
"prompt": str(input_text),
"llm_config": {"temperature": temperature, "max_new_tokens": max_length},
}


def _get_predictions(
prediction_endpoint: str, payload: Dict[str, List[Dict[str, Any]]]
prediction_endpoint: str, payload: Dict[str, Union[str, Dict[str, float]]]
) -> str:
"""Using the prediction endpoint and payload, make a prediction request to the deployed model.

Args:
prediction_endpoint (str): the url endpoint.
payload (Dict[str, List[Dict[str, Any]]]): the payload to send to the model.
payload (Dict[str, Union[str, Dict[str, float]]]): the payload to send to the model.

Returns:
str: the predictions from the model.
@@ -148,9 +147,7 @@ def _get_predictions(
data=json.dumps(payload),
headers={"Content-Type": "application/json"},
)
data = json.loads(json.loads(response.text)["outputs"][0]["data"][0])

return str(data["generated_text"])
return str(json.loads(response.text)["responses"][0])


def query_llm(
@@ -172,7 +169,9 @@ def query_llm(
"""
with st.spinner("Loading response..."):
payload = _create_payload(messages, temperature, max_length)
logging.info(payload)
logging.info(f"Payload:\n{payload}")
summary_txt = _get_predictions(prediction_endpoint, payload)
summary_txt = clean_fastchat_t5_output(summary_txt)
logging.info(f"LLM Response:\n{summary_txt}")

return summary_txt
8 changes: 4 additions & 4 deletions app/configs/service_config.py
@@ -11,10 +11,10 @@
}
COLLECTION_NAME_MAP = {"mind_data": "Mind", "nhs_data": "NHS"}

# Seldon configuration
SELDON_SERVICE_NAME = "llm-default-transformer"
SELDON_NAMESPACE = "matcha-seldon-workloads"
SELDON_PORT = 9000
# OpenLLM configuration
OPENLLM_SERVICE_NAME = "openllm-mindgpt-svc"
OPENLLM_NAMESPACE = "default"
OPENLLM_PORT = 3000

# Metric service configuration
METRIC_SERVICE_NAME = "monitoring-service"
4 changes: 2 additions & 2 deletions data/mind_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
- md5: c76568f0dd939286c633b2ed3a557630
size: 2336306
- md5: de6fed15842b8661c864c01a253d04a2
size: 1859742
hash: md5
path: mind_data_validated.csv
4 changes: 2 additions & 2 deletions data/nhs_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
- md5: c4b3c09b6499a4faeec99fd7dbc27d4f
size: 811904
- md5: 1d5ebb8903a5470374ae9271ddbd9f59
size: 797289
hash: md5
path: nhs_data_validated.csv
11 changes: 11 additions & 0 deletions infrastructure/llm_k8s/Dockerfile
@@ -0,0 +1,11 @@
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

ENV BENTOML_HOME="/model_store/"
ENV CUDA_VISIBLE_DEVICES=0

RUN apt-get update && apt-get install -y python3-pip
RUN pip3 install "openllm[flan-t5]"
EXPOSE 3000

ENTRYPOINT [ "openllm", "start" ]
CMD [ "flan-t5", "--model-id", "lmsys/fastchat-t5-3b-v1.0", "--device", "0", "-p", "3000", "--verbose"]
43 changes: 43 additions & 0 deletions infrastructure/llm_k8s/openllm-deployment.yaml
@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: openllm-mindgpt-deployment
labels:
app: openllm-mindgpt
spec:
replicas: 1
selector:
matchLabels:
app: openllm-mindgpt
template:
metadata:
labels:
app: openllm-mindgpt
spec:
containers:
- name: openllm-container
image: largellmcr.azurecr.io/mindgpt/openllm
imagePullPolicy: IfNotPresent
resources:
limits:
nvidia.com/gpu: 1
ports:
- containerPort: 3000
restartPolicy: Always
tolerations:
- key: "sku"
operator: "Equal"
value: "gpu"
effect: "NoSchedule"
---
apiVersion: v1
kind: Service
metadata:
name: openllm-mindgpt-svc
spec:
selector:
app: openllm-mindgpt
ports:
- protocol: TCP
port: 3000
targetPort: 3000