
jetjodh solution w/ fastapi #42

Open · wants to merge 20 commits into main

4 changes: 2 additions & 2 deletions autotests/helm/values.yaml
@@ -25,8 +25,8 @@ global:
activeDeadlineSeconds: 3600 # 1h

env:
-  PARTICIPANT_NAME: <REPLACE_WITH_USERNAME>
-  api_host: http://inca-smc-mlops-challenge-solution.default.svc.cluster.local/<REPLACE_WITH_ENDPOINT>
+  PARTICIPANT_NAME: jetjodh
+  api_host: http://inca-smc-mlops-challenge-solution.default.svc.cluster.local/process

# K6, do not edit!
K6_PROMETHEUS_RW_SERVER_URL: http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write
24 changes: 24 additions & 0 deletions solution/Dockerfile
@@ -0,0 +1,24 @@
# Base image
FROM python:3.9

# Set working directory
WORKDIR /app

# Copy project files into the Docker image
COPY requirements.txt /app
COPY main.py /app
COPY models.py /app
RUN ls  # debug: confirm the project files were copied

# Install dependencies
RUN pip install -r requirements.txt

# Set environment variables
ENV FASTAPI_APP=main.py

# Expose port
EXPOSE 8000

# Export and optimize the ONNX models, then start the FastAPI application.
# Exec-form CMD does not interpret "&&", so both steps run through a shell.
CMD ["sh", "-c", "python3 models.py && uvicorn main:app --host 0.0.0.0 --port 8000"]
22 changes: 22 additions & 0 deletions solution/docker-compose.yml
@@ -0,0 +1,22 @@
version: "3.9"

services:
web:
build: .
image: app
ports:
- "8000:8000"
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]

networks:
default:
driver: bridge
ipam:
config:
- subnet: 172.16.57.0/24
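
The deploy.resources block reserves GPU 0 through the NVIDIA runtime, which requires the NVIDIA Container Toolkit on the host. As a quick sanity check (a sketch, not part of this PR), one can confirm inside the running container that onnxruntime actually exposes a GPU provider:

# Sketch: run inside the container to confirm GPU execution providers are available.
# Assumes onnxruntime-gpu is installed, as pinned in requirements.txt.
import onnxruntime as ort

providers = ort.get_available_providers()
print(providers)  # expect "CUDAExecutionProvider" (and possibly "TensorrtExecutionProvider")
assert "CUDAExecutionProvider" in providers, "no GPU execution provider visible"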
164 changes: 164 additions & 0 deletions solution/main.py
@@ -0,0 +1,164 @@
import asyncio
import time
import numpy as np
from transformers import AutoConfig, AutoTokenizer
from onnxruntime.transformers.io_binding_helper import IOBindingHelper
from onnxruntime import InferenceSession, GraphOptimizationLevel
import torch
from fastapi import FastAPI, Request
from cachetools import TTLCache

app = FastAPI()

# Load the NLP models
models = {
"cardiffnlp": {
"model_name": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
"tokenizer": AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment"),
"model": InferenceSession("models/optimized_cardiffnlp/model.onnx",providers=['TensorrtExecutionProvider', "CUDAExecutionProvider"]),
"id2label": AutoConfig.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment").id2label
},
"ivanlau": {
"model_name": "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base",
"tokenizer": AutoTokenizer.from_pretrained("ivanlau/language-detection-fine-tuned-on-xlm-roberta-base"),
"model": InferenceSession("models/optimized_ivanlau/model.onnx",providers=['TensorrtExecutionProvider', "CUDAExecutionProvider"]),
"id2label": AutoConfig.from_pretrained("ivanlau/language-detection-fine-tuned-on-xlm-roberta-base").id2label
},
"svalabs": {
"model_name": "svalabs/twitter-xlm-roberta-crypto-spam",
"tokenizer": AutoTokenizer.from_pretrained("svalabs/twitter-xlm-roberta-crypto-spam"),
"model": InferenceSession("models/optimized_svalabs/model.onnx",providers=['TensorrtExecutionProvider', "CUDAExecutionProvider"]),
"id2label": AutoConfig.from_pretrained("svalabs/twitter-xlm-roberta-crypto-spam").id2label
},
"EIStakovskii": {
"model_name": "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus",
"tokenizer": AutoTokenizer.from_pretrained("EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"),
"model": InferenceSession("models/optimized_EIStakovskii/model.onnx",providers=['TensorrtExecutionProvider', "CUDAExecutionProvider"]),
"id2label": AutoConfig.from_pretrained("EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus").id2label
},
"jy46604790": {
"model_name": "jy46604790/Fake-News-Bert-Detect",
"tokenizer": AutoTokenizer.from_pretrained("jy46604790/Fake-News-Bert-Detect"),
"model": InferenceSession("models/optimized_jy46604790/model.onnx",providers=['TensorrtExecutionProvider', "CUDAExecutionProvider"]),
"id2label": AutoConfig.from_pretrained("jy46604790/Fake-News-Bert-Detect").id2label
},
}


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def prepare_io_binding(
ort_session, input_dict, output_buffers, output_shapes
):
"""
Prepare the input/output binding for the provided ONNX Runtime session.

:param ort_session: The ONNX Runtime session to bind inputs/outputs
:type ort_session: onnxruntime.InferenceSession

:param input_dict: A dictionary containing the input names and values
:type input_dict: dict[str, numpy.ndarray]

:param output_buffers: A dictionary containing the output buffers to store the output values
:type output_buffers: dict[str, numpy.ndarray]

:param output_shapes: A dictionary containing the output shapes
:type output_shapes: dict[str, List[int]]

:return: The IO binding for the provided ONNX Runtime session
:rtype: onnxruntime.OrtDevice
"""
ort_session.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
io_binding = ort_session.io_binding()

# Bind inputs
for name, input_val in input_dict.items():
if input_val is not None:
input_val = torch.from_numpy(input_val)
io_binding.bind_input(
name,
input_val.device.type,
0,
np.int64,
input_val.size(),
input_val.data_ptr(),
)

# Bind outputs
for output in ort_session.get_outputs():
output_name = output.name
output_buffer = output_buffers[output_name]
io_binding.bind_output(
output_name,
"cuda",
0,
np.float32,
output_shapes[output_name],
output_buffer.data_ptr(),
)

return io_binding

async def model_inference(model, labels, inputs):
"""
Asynchronously performs inference on a PyTorch model using the provided inputs.

:param model: The PyTorch model to perform inference on.
:type model: torch.nn.Module
:param labels: A list of labels corresponding to the model's output classes.
:type labels: List[str]
:param inputs: The inputs to be passed to the model for inference.
:type inputs: torch.Tensor
:return: A dictionary containing the highest scoring label and its corresponding score.
:rtype: Dict[str, Union[str, float]]
"""
output_buffers = {
"logits": torch.empty(
(model.get_outputs()[0].shape[1],), dtype=torch.float32, device="cuda"
),
}
output_shapes = {
"logits": [1, model.get_outputs()[0].shape[1]],
}
io_binding = prepare_io_binding(
model,
inputs,
output_buffers,
output_shapes,
)

model.run_with_iobinding(io_binding)
outputs = IOBindingHelper.get_outputs_from_io_binding_buffer(
model, output_buffers=output_buffers, output_shapes=output_shapes
)
outputs = torch.from_numpy(outputs[0])
    scores = torch.nn.functional.softmax(outputs, dim=-1)[0]
max_i = scores.argmax().item()
return {"score": scores[max_i].item(), "label": labels[max_i]}


@app.post("/process")
async def process(request: Request):
# initiate timer
start_time = time.time()
text = (await request.body()).decode("utf-8")
results = TTLCache(maxsize=5000, ttl=300)
tasks = []

for model_name, model_data in models.items():
tokenizer = model_data["tokenizer"]
model = model_data["model"]

inputs = tokenizer(text, return_tensors="np", padding=True, truncation=True)
inputs = {key: np.array(val, dtype=np.int64) for key, val in inputs.items()}
tasks.append(asyncio.create_task(model_inference(model, model_data["id2label"], inputs)))
results = await asyncio.gather(*tasks)
results_dict = dict(zip(list(models.keys()), results))
end_time = time.time()
print(end_time - start_time)

return results_dict

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
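
For reference, a minimal client sketch (not part of this PR) that exercises the /process endpoint, assuming the service from docker-compose.yml is reachable on localhost:8000:

# Sketch: smoke-test the /process endpoint with a raw text body.
# The response maps each model key to {"score": float, "label": str}.
import requests

resp = requests.post(
    "http://localhost:8000/process",
    data="This is a test tweet about crypto".encode("utf-8"),
)
resp.raise_for_status()
for model_key, result in resp.json().items():
    print(f'{model_key}: {result["label"]} ({result["score"]:.3f})')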
59 changes: 59 additions & 0 deletions solution/models.py
@@ -0,0 +1,59 @@
import os
from optimum.onnxruntime import ORTModelForSequenceClassification
from onnxruntime.transformers.io_binding_helper import IOBindingHelper
from onnxruntime.transformers.optimizer import optimize_model
import torch

models = {
"cardiffnlp": {
"model_name": "cardiffnlp/twitter-xlm-roberta-base-sentiment",
"tokenizer": None,
"model": None,
},
"ivanlau": {
"model_name": "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base",
"tokenizer": None,
"model": None,
},
"svalabs": {
"model_name": "svalabs/twitter-xlm-roberta-crypto-spam",
"tokenizer": None,
"model": None,
},
"EIStakovskii": {
"model_name": "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus",
"tokenizer": None,
"model": None,
},
"jy46604790": {
"model_name": "jy46604790/Fake-News-Bert-Detect",
"tokenizer": None,
"model": None,
},
}

def download_model(model_name):
optimized_onnx_path = f"optimized_{model_name}"
model_data = models[model_name]
model_data["optimized_onnx_path"] = os.path.join("models", optimized_onnx_path)
model_dir = os.path.join("models", model_name)

if not os.path.exists(model_dir):
os.makedirs(model_dir)

model = ORTModelForSequenceClassification.from_pretrained(model_data["model_name"], export=True, use_io_binding=False).to(device)
model.save_pretrained(str("models/" + optimized_onnx_path))
model = optimize_model(
input=str("models/" + optimized_onnx_path + "/model.onnx"),
model_type="bert",
use_gpu=True
)
model.save_model_to_file(str("models/" + optimized_onnx_path + "/model.onnx"))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for model_name in models:
torch.cuda.empty_cache()
download_model(model_name)
print(model_name)
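
models.py writes each exported and optimized model to models/optimized_<key>/model.onnx, which is exactly the path main.py opens. A quick way to sanity-check one export (a sketch, not part of this PR; it falls back to CPU when no GPU is present):

# Sketch: load one optimized model the same way main.py does and run a single input.
import numpy as np
from onnxruntime import InferenceSession
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
session = InferenceSession(
    "models/optimized_cardiffnlp/model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
encoded = tokenizer("hello world", return_tensors="np")
logits = session.run(None, {k: v.astype(np.int64) for k, v in encoded.items()})[0]
print(logits.shape)  # expected: (1, num_labels)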
10 changes: 10 additions & 0 deletions solution/requirements.txt
@@ -0,0 +1,10 @@
fastapi[all]
tritonclient[all]==2.34.0
pydantic==1.10.9
cachetools==5.3.1
onnxruntime_gpu==1.15.1
numpy
transformers==4.29.2
torch==2.0.0
optimum==1.8.7
onnx==1.13.1