
Commit

Merge pull request rh-aiservices-bu#161 from rh-aiservices-bu/feat/sanity_not_health_endpoint

Improved response time test and updated 03/06 for VLLM
guimou authored Mar 20, 2024
2 parents 678f477 + 8169324 commit 5fe3c30
Showing 4 changed files with 27 additions and 25 deletions.
22 changes: 10 additions & 12 deletions lab-materials/03/06/llm_usage.py
@@ -1,30 +1,28 @@
 import os
 
-from langchain.llms import HuggingFaceTextGenInference
+from langchain.llms import VLLMOpenAI
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain.evaluation import load_evaluator
 from langchain.embeddings import HuggingFaceEmbeddings
 
-INFERENCE_SERVER_URL = "http://llm.ic-shared-llm.svc.cluster.local:3000"
+INFERENCE_SERVER_URL = "http://vllm.llm-hosting.svc.cluster.local:8000"
 MAX_NEW_TOKENS = 512
-TOP_K = 10
 TOP_P = 0.95
-TYPICAL_P = 0.95
 TEMPERATURE = 0.01
-REPETITION_PENALTY = 1.03
+PRESENCE_PENALTY = 1.03
 
 def infer_with_template(input_text, template):
-    llm = HuggingFaceTextGenInference(
-        inference_server_url=INFERENCE_SERVER_URL,
-        max_new_tokens=MAX_NEW_TOKENS,
-        top_k=TOP_K,
+    llm = VLLMOpenAI(
+        openai_api_key="EMPTY",
+        openai_api_base= f"{INFERENCE_SERVER_URL}/v1",
+        model_name="mistralai/Mistral-7B-Instruct-v0.2",
+        max_tokens=MAX_NEW_TOKENS,
         top_p=TOP_P,
-        typical_p=TYPICAL_P,
         temperature=TEMPERATURE,
-        repetition_penalty=REPETITION_PENALTY,
-        streaming=True,
+        presence_penalty=PRESENCE_PENALTY,
+        streaming=False,
         verbose=False,
     )

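The rest of infer_with_template is collapsed in this view. Judging from the imports kept above (PromptTemplate, LLMChain), a minimal sketch of how the collapsed tail of the function might complete the call is shown below; the variable names and chain wiring are assumptions, not the repository's actual lines.

    # Hypothetical continuation of infer_with_template (collapsed in the diff above):
    # build a prompt from the caller-supplied template and run the input through an LLMChain.
    prompt = PromptTemplate.from_template(template)  # template is expected to expose one input variable
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    return llm_chain.run(input_text)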
3 changes: 2 additions & 1 deletion lab-materials/03/06/requirements.txt
@@ -1,3 +1,4 @@
 langchain==0.0.340
 text_generation==0.6.1
-sentence_transformers==2.2.2
+sentence_transformers==2.2.2
+openai==1.13.3
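The openai package is pulled in because VLLMOpenAI talks to vLLM through its OpenAI-compatible REST API. As a quick illustration only (not part of the lab code), the same endpoint could be exercised directly with the openai 1.x client; the prompt and parameter values here are arbitrary:

from openai import OpenAI

# Hypothetical direct call against the vLLM OpenAI-compatible endpoint configured in llm_usage.py.
client = OpenAI(base_url="http://vllm.llm-hosting.svc.cluster.local:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    prompt="<s>[INST] Summarize this claim in one sentence. [/INST]",
    max_tokens=512,
    temperature=0.01,
)
print(completion.choices[0].text)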
2 changes: 1 addition & 1 deletion lab-materials/03/06/test_response_quality.py
@@ -9,7 +9,7 @@ def test_response_quality():
     with open('summary_template.txt') as f:
         template = f.read()
 
-    expected_response = """A car insurance claim has been initiated by John Smith for a recent accident involving his Honda Accord and a Ford Escape. The accident occurred on October 15, 2023, at approximately 2:30 PM, at the intersection of Elm Street and Maple Avenue, near Smith Park, in Springfield, Illinois. The other party ran a red light and collided with the front passenger side of John's vehicle, causing significant damage to both vehicles. John sustained no serious injuries, but there were witnesses to the accident, and he has photos of the scene and the other party's insurance information. He is requesting that the insurance company initiate a claim under his policy for the damages to his vehicle and has provided the necessary documentation and information."""
+    expected_response = """On October 15, 2023, at around 2:30 PM, John Smith was involved in a car accident at the intersection of Elm Street and Maple Avenue in Springfield, Illinois (coordinates: 39.7476° N, 89.6960° W). He was driving his Honda Accord with a green light when a Ford Escape, which ran a red light, collided with the front passenger side of his vehicle. The accident occurred in overcast weather with light rain, and the road was wet. No serious injuries were reported, but both vehicles sustained significant damage. A police report was filed, and the officer's badge number is 12345. Witnesses to the accident include Sarah Johnson, Mark Williams, and Lisa Anderson, and their contact information has been provided. Photos of the accident scene, including the damage to both vehicles, traffic signals, and road conditions, have also been taken. John is requesting that a claim be initiated under his policy (ABC12345) for the damages to his vehicle and is asking for guidance on the claim process and required documentation."""
 
     response = infer_with_template(input_text, template)
     print(f"Response: {response}")
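The assertion itself sits below the fold. Given the load_evaluator and HuggingFaceEmbeddings imports shown in llm_usage.py, a plausible sketch of that collapsed check is shown below; the evaluator choice and the 0.1 threshold are assumptions, not the file's actual values.

    # Hypothetical continuation of test_response_quality: score semantic similarity between
    # the model output and expected_response, then fail above a distance threshold.
    # (Requires: from langchain.evaluation import load_evaluator;
    #            from langchain.embeddings import HuggingFaceEmbeddings)
    embeddings = HuggingFaceEmbeddings()
    evaluator = load_evaluator("embedding_distance", embeddings=embeddings)
    result = evaluator.evaluate_strings(prediction=response, reference=expected_response)
    if result["score"] > 0.1:  # lower distance means closer; 0.1 is an assumed cutoff
        raise Exception(f"Response too far from the expected summary: {result['score']}")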
25 changes: 14 additions & 11 deletions lab-materials/03/06/test_responsetime.py
@@ -1,20 +1,24 @@
 import os
+from llm_usage import infer_with_template
-import requests
 import json
 import time
 
-max_response_time = 0.5
+max_response_time = 3
 
-def send_request(endpoint):
-    response = requests.get(endpoint)
-    return response
-
-def test_responsetime(endpoint):
-    response = send_request(endpoint)
-
-    if response.status_code==200:
-        response_time = response.elapsed.total_seconds()
-    else:
-        raise Exception(f"Response status code is {response.status_code}")
+def test_responsetime():
+    TEMPLATE = """<s>[INST] <<SYS>>
+Answer below truthfully and in less than 10 words:
+<</SYS>>
+{silly_question}
+[/INST]"""
+
+    start = time.perf_counter()
+    response = infer_with_template("Who saw a saw saw a salsa?", TEMPLATE)
+    response_time = time.perf_counter() - start
 
     if response_time>max_response_time:
         raise Exception(f"Response took {response_time} which is greater than {max_response_time}")
@@ -27,5 +31,4 @@ def test_responsetime(endpoint):
         }, f)
 
 if __name__ == '__main__':
-    health_endpoint = "http://llm.ic-shared-llm.svc.cluster.local:3000" + "/health"
-    test_responsetime(health_endpoint)
+    test_responsetime()
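Between the two hunks, the collapsed lines write the measured time out as JSON; the `}, f)` context line above is the tail of that block. A minimal sketch of what the elided block might look like follows; the file name and dictionary keys are assumptions.

    # Hypothetical reconstruction of the collapsed lines in test_responsetime:
    # persist the measurement as JSON so a later pipeline step can read it.
    with open("responsetime_result.json", "w") as f:
        json.dump({
            "response_time": response_time,
        }, f)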
