Skip to content

Commit

Permalink
generate explicit autopilot anti-affinities (#104)
Browse the repository at this point in the history
Works around project-codeflare/appwrapper#259
as a stopgap until RHOAI 2.16 is released.
  • Loading branch information
dgrove-oss authored Nov 20, 2024
1 parent b9010d8 commit ae778d1
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 1 deletion.
8 changes: 7 additions & 1 deletion tools/pytorchjob-generator/chart/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,18 @@ terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
schedulerName: default-scheduler
{{- end }}
priorityClassName: {{ .Values.priority }}
{{- if .Values.hostIgnoreList }}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
{{- if .Values.hostIgnoreList }}
- key: kubernetes.io/hostname
operator: NotIn
values:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ Adding Volume Mounts:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -85,6 +96,17 @@ Adding Volume Mounts:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -165,6 +187,17 @@ Adding initContainers:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -227,6 +260,17 @@ Adding initContainers:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -310,6 +354,17 @@ AppWrapper metadata should match snapshot:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -359,6 +414,17 @@ AppWrapper metadata should match snapshot:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -429,6 +495,17 @@ AppWrapper spec should match snapshot:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -478,6 +555,17 @@ AppWrapper spec should match snapshot:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -548,6 +636,17 @@ Enabling NVMe:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -612,6 +711,17 @@ Enabling NVMe:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -699,6 +809,17 @@ Enabling RoCE GDR:
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -764,6 +885,17 @@ Enabling RoCE GDR:
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -850,6 +982,17 @@ Enabling all advanced features at once:
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -967,6 +1110,17 @@ Enabling all advanced features at once:
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -1103,6 +1257,17 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down Expand Up @@ -1166,6 +1331,17 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
metadata:
namespace: my-namespace
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
Expand Down

0 comments on commit ae778d1

Please sign in to comment.