# deployment.yaml
---
# Deployment "vllm": runs one pod of the vLLM server image, started with
# --model mistralai/Mistral-7B-Instruct-v0.2 and one NVIDIA GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
  labels:
    app: vllm
spec:
  replicas: 1
  revisionHistoryLimit: 10
  # NOTE(review): the startup probe below tolerates up to 24 x 30s = 720s of
  # start-up, which is longer than this 600s progress deadline -- confirm that
  # a rollout reported as stalled during a slow first start is acceptable.
  progressDeadlineSeconds: 600
  # Recreate terminates the existing pod before the replacement is created.
  # Presumably chosen because the GPU and the models-cache claim cannot be
  # shared by two pods during a rollout -- TODO confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        app: vllm
    spec:
      restartPolicy: Always
      schedulerName: default-scheduler
      dnsPolicy: ClusterFirst
      affinity: {}
      terminationGracePeriodSeconds: 120
      securityContext: {}
      # Allow scheduling onto nodes tainted nvidia.com/gpu with NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: server
          image: 'quay.io/rh-aiservices-bu/vllm-openai-ubi9:0.4.2'
          imagePullPolicy: IfNotPresent
          # Every argument is a string; numeric-looking values stay quoted.
          args:
            - '--model'
            - 'mistralai/Mistral-7B-Instruct-v0.2'
            # Points at the models-cache volume mounted below.
            - '--download-dir'
            - '/models-cache'
            - '--dtype'
            - 'float16'
            - '--max-model-len'
            - '6144'
          env:
            # Intentionally empty in this manifest.
            # NOTE(review): if a token is required, source it from a Secret
            # (valueFrom.secretKeyRef) instead of committing it here.
            - name: HUGGING_FACE_HUB_TOKEN
              value: ''
          ports:
            # Named port; the three probes below refer to it as "http".
            - name: http
              containerPort: 8000
              protocol: TCP
          resources:
            limits:
              cpu: '8'
              memory: 24Gi
              nvidia.com/gpu: '1'
            # Only CPU is requested explicitly; for resources with a limit
            # but no request, Kubernetes uses the limit as the request
            # (absent an admission-time default).
            requests:
              cpu: '6'
          volumeMounts:
            - name: models-cache
              mountPath: /models-cache
            - name: shm
              mountPath: /dev/shm
          # Gates the other probes until /health first succeeds; allows up to
          # failureThreshold x periodSeconds = 24 x 30s = 720s.
          startupProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            timeoutSeconds: 1
            periodSeconds: 30
            successThreshold: 1
            failureThreshold: 24
          # Marks the pod unready after 3 consecutive failures, 30s apart.
          readinessProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            timeoutSeconds: 5
            periodSeconds: 30
            successThreshold: 1
            failureThreshold: 3
          # Restarts the container after 3 consecutive failures, 100s apart.
          livenessProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            timeoutSeconds: 8
            periodSeconds: 100
            successThreshold: 1
            failureThreshold: 3
          # Restricted container profile: non-root, no privilege escalation,
          # all Linux capabilities dropped, runtime default seccomp profile.
          securityContext:
            runAsNonRoot: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            seccompProfile:
              type: RuntimeDefault
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
      volumes:
        # Backs --download-dir; the claim vllm-models-cache is defined
        # outside this manifest.
        - name: models-cache
          persistentVolumeClaim:
            claimName: vllm-models-cache
        # Memory-backed /dev/shm capped at 1Gi.
        # NOTE(review): presumably sized for the server's shared-memory use;
        # confirm 1Gi is sufficient.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi