
Commit

Merge pull request rh-aiservices-bu#161 from rh-aiservices-bu/feat/sanity_not_health_endpoint

Improved response time test and updated 03/06 for VLLM
guimou authored Mar 20, 2024
2 parents 678f477 + 8169324 commit 5fe3c30
Showing 4 changed files with 27 additions and 25 deletions.
22 changes: 10 additions & 12 deletions lab-materials/03/06/llm_usage.py
@@ -1,30 +1,28 @@
 import os
 
-from langchain.llms import HuggingFaceTextGenInference
+from langchain.llms import VLLMOpenAI
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain.evaluation import load_evaluator
 from langchain.embeddings import HuggingFaceEmbeddings
 
-INFERENCE_SERVER_URL = "http://llm.ic-shared-llm.svc.cluster.local:3000"
+INFERENCE_SERVER_URL = "http://vllm.llm-hosting.svc.cluster.local:8000"
 MAX_NEW_TOKENS = 512
-TOP_K = 10
 TOP_P = 0.95
-TYPICAL_P = 0.95
 TEMPERATURE = 0.01
-REPETITION_PENALTY = 1.03
+PRESENCE_PENALTY = 1.03
 
 def infer_with_template(input_text, template):
-    llm = HuggingFaceTextGenInference(
-        inference_server_url=INFERENCE_SERVER_URL,
-        max_new_tokens=MAX_NEW_TOKENS,
-        top_k=TOP_K,
+    llm = VLLMOpenAI(
+        openai_api_key="EMPTY",
+        openai_api_base= f"{INFERENCE_SERVER_URL}/v1",
+        model_name="mistralai/Mistral-7B-Instruct-v0.2",
+        max_tokens=MAX_NEW_TOKENS,
         top_p=TOP_P,
-        typical_p=TYPICAL_P,
         temperature=TEMPERATURE,
-        repetition_penalty=REPETITION_PENALTY,
-        streaming=True,
+        presence_penalty=PRESENCE_PENALTY,
+        streaming=False,
         verbose=False,
     )

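The rest of infer_with_template is collapsed in this view. Judging from the imports kept above (PromptTemplate, LLMChain), a minimal sketch of how the collapsed tail of the function might complete the call is shown below; the variable names and chain wiring are assumptions, not the repository's actual lines.

    # Hypothetical continuation of infer_with_template (collapsed in the diff above):
    # build a prompt from the caller-supplied template and run the input through an LLMChain.
    prompt = PromptTemplate.from_template(template)  # template is expected to expose one input variable
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    return llm_chain.run(input_text)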
3 changes: 2 additions & 1 deletion lab-materials/03/06/requirements.txt
@@ -1,3 +1,4 @@
 langchain==0.0.340
 text_generation==0.6.1
-sentence_transformers==2.2.2
+sentence_transformers==2.2.2
+openai==1.13.3
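The openai package is pulled in because VLLMOpenAI talks to vLLM through its OpenAI-compatible REST API. As a quick illustration only (not part of the lab code), the same endpoint could be exercised directly with the openai 1.x client; the prompt and parameter values here are arbitrary:

from openai import OpenAI

# Hypothetical direct call against the vLLM OpenAI-compatible endpoint configured in llm_usage.py.
client = OpenAI(base_url="http://vllm.llm-hosting.svc.cluster.local:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    prompt="<s>[INST] Summarize this claim in one sentence. [/INST]",
    max_tokens=512,
    temperature=0.01,
)
print(completion.choices[0].text)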
2 changes: 1 addition & 1 deletion lab-materials/03/06/test_response_quality.py
@@ -9,7 +9,7 @@ def test_response_quality():
     with open('summary_template.txt') as f:
         template = f.read()
 
-    expected_response = """A car insurance claim has been initiated by John Smith for a recent accident involving his Honda Accord and a Ford Escape. The accident occurred on October 15, 2023, at approximately 2:30 PM, at the intersection of Elm Street and Maple Avenue, near Smith Park, in Springfield, Illinois. The other party ran a red light and collided with the front passenger side of John's vehicle, causing significant damage to both vehicles. John sustained no serious injuries, but there were witnesses to the accident, and he has photos of the scene and the other party's insurance information. He is requesting that the insurance company initiate a claim under his policy for the damages to his vehicle and has provided the necessary documentation and information."""
+    expected_response = """On October 15, 2023, at around 2:30 PM, John Smith was involved in a car accident at the intersection of Elm Street and Maple Avenue in Springfield, Illinois (coordinates: 39.7476° N, 89.6960° W). He was driving his Honda Accord with a green light when a Ford Escape, which ran a red light, collided with the front passenger side of his vehicle. The accident occurred in overcast weather with light rain, and the road was wet. No serious injuries were reported, but both vehicles sustained significant damage. A police report was filed, and the officer's badge number is 12345. Witnesses to the accident include Sarah Johnson, Mark Williams, and Lisa Anderson, and their contact information has been provided. Photos of the accident scene, including the damage to both vehicles, traffic signals, and road conditions, have also been taken. John is requesting that a claim be initiated under his policy (ABC12345) for the damages to his vehicle and is asking for guidance on the claim process and required documentation."""
 
     response = infer_with_template(input_text, template)
     print(f"Response: {response}")
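The assertion itself sits below the fold. Given the load_evaluator and HuggingFaceEmbeddings imports shown in llm_usage.py, a plausible sketch of that collapsed check is shown below; the evaluator choice and the 0.1 threshold are assumptions, not the file's actual values.

    # Hypothetical continuation of test_response_quality: score semantic similarity between
    # the model output and expected_response, then fail above a distance threshold.
    # (Requires: from langchain.evaluation import load_evaluator;
    #            from langchain.embeddings import HuggingFaceEmbeddings)
    embeddings = HuggingFaceEmbeddings()
    evaluator = load_evaluator("embedding_distance", embeddings=embeddings)
    result = evaluator.evaluate_strings(prediction=response, reference=expected_response)
    if result["score"] > 0.1:  # lower distance means closer; 0.1 is an assumed cutoff
        raise Exception(f"Response too far from the expected summary: {result['score']}")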
25 changes: 14 additions & 11 deletions lab-materials/03/06/test_responsetime.py
@@ -1,20 +1,24 @@
 import os
+from llm_usage import infer_with_template
-import requests
 import json
 import time
 
-max_response_time = 0.5
+max_response_time = 3
 
-def send_request(endpoint):
-    response = requests.get(endpoint)
-    return response
-
-def test_responsetime(endpoint):
-    response = send_request(endpoint)
-
-    if response.status_code==200:
-        response_time = response.elapsed.total_seconds()
-    else:
-        raise Exception(f"Response status code is {response.status_code}")
+def test_responsetime():
+    TEMPLATE = """<s>[INST] <<SYS>>
+Answer below truthfully and in less than 10 words:
+<</SYS>>
+{silly_question}
+[/INST]"""
+
+    start = time.perf_counter()
+    response = infer_with_template("Who saw a saw saw a salsa?", TEMPLATE)
+    response_time = time.perf_counter() - start
 
     if response_time>max_response_time:
         raise Exception(f"Response took {response_time} which is greater than {max_response_time}")
@@ -27,5 +31,4 @@ def test_responsetime(endpoint):
         }, f)
 
 if __name__ == '__main__':
-    health_endpoint = "http://llm.ic-shared-llm.svc.cluster.local:3000" + "/health"
-    test_responsetime(health_endpoint)
+    test_responsetime()
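Between the two hunks, the collapsed lines write the measured time out as JSON; the `}, f)` context line above is the tail of that block. A minimal sketch of what the elided block might look like follows; the file name and dictionary keys are assumptions.

    # Hypothetical reconstruction of the collapsed lines in test_responsetime:
    # persist the measurement as JSON so a later pipeline step can read it.
    with open("responsetime_result.json", "w") as f:
        json.dump({
            "response_time": response_time,
        }, f)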
