From 397ea3ba07473b9a79c2518fb63635aaaa88373f Mon Sep 17 00:00:00 2001
From: Lixiang Luo
Date: Thu, 19 Dec 2024 12:40:50 -0500
Subject: [PATCH] Sakkara support in MLBatch is added

---
Review notes (this section is ignored by `git am`):

* Purpose: let the pytorchjob-generator chart select the pod scheduler via a
  new `schedulerName` value ("sakkara" or "default-scheduler"). When it is
  "sakkara", the pod's `sakkara.member.rank` label is exposed as SAKKARA_RANK
  through the downward API and RANK is overridden with it before the user
  setup commands run.
* `schedulerName` now falls back to "default-scheduler" when the value is
  unset or null, so charts that only set `bypassCoscheduler: true` keep
  rendering the scheduler name they rendered before this patch.
* Every comparison against "sakkara" goes through `default ""` so that an
  unset value is compared as an empty string instead of nil.
* NOTE(review): this patch was restored from a whitespace-collapsed copy.
  Leading indentation of context/added lines is a reconstruction and the
  post-image blob ids in the `index` lines predate the two fixes above.
  Verify against the tree, or apply with `git apply --ignore-whitespace`.

 .../chart/templates/_helpers.tpl              | 14 ++++++++--
 .../chart/values.schema.json                  |  4 +++
 .../examples/helloworld-sakkara.settings.yaml | 28 +++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml

diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl
index 34f751a..6da13f0 100644
--- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl
+++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl
@@ -33,7 +33,7 @@ annotations:
 terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
 {{- end }}
 {{- if .Values.bypassCoscheduler }}
-schedulerName: default-scheduler
+schedulerName: {{ .Values.schedulerName | default "default-scheduler" }}
 {{- end }}
 priorityClassName: {{ .Values.priority }}
 affinity:
@@ -79,8 +79,14 @@ envFrom:
     - configMapRef:
         name: {{ .Values.ncclGdrEnvConfigMap }}
 {{- end }}
-{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
+{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq (.Values.schedulerName | default "") "sakkara" ) }}
 env:
+    {{- if eq (.Values.schedulerName | default "") "sakkara" }}
+    - name: SAKKARA_RANK
+      valueFrom:
+        fieldRef:
+          fieldPath: metadata.labels['sakkara.member.rank']
+    {{- end }}
     {{- if .Values.topologyFileConfigMap }}
     - name: NCCL_TOPO_FILE
       value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -144,6 +150,10 @@ command:
       #
       # User commands
       #
+      {{- if eq (.Values.schedulerName | default "") "sakkara" }}
+      echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+      export RANK=$SAKKARA_RANK
+      {{- end }}
       {{- range $command := .Values.setupCommands }}
       {{ $command }}
       {{- end }}
diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json
index d56f91f..dc8e17e 100644
--- a/tools/pytorchjob-generator/chart/values.schema.json
+++ b/tools/pytorchjob-generator/chart/values.schema.json
@@ -113,6 +113,10 @@
             { "type": "array" }
         ]},
         "bypassCoscheduler": { "type": "boolean" },
+        "schedulerName": { "oneOf": [
+            { "type": "null" },
+            { "type": "string", "enum": ["sakkara", "default-scheduler" ] }
+        ]},
         "serviceAccountName": { "oneOf" : [
             { "type": "null" },
             { "$ref": "#/$defs/rfc1123Label" }
diff --git a/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml b/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml
new file mode 100644
index 0000000..8845b27
--- /dev/null
+++ b/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml
@@ -0,0 +1,28 @@
+namespace: my-namespace       # namespace to deploy to (required)
+jobName: my-job               # name of the generated AppWrapper and PyTorchJob objects (required)
+queueName: default-queue      # local queue to submit to (default: default-queue)
+
+bypassCoscheduler: true
+schedulerName: sakkara
+# If additional constraints are used, specify the configmap here:
+#customLabels:
+#  - key: sakkara.group.name
+#    value: my-topogrp-0
+
+numPods: 4                    # total pod count including master and worker pods (default: 1)
+numCpusPerPod: 500m           # requested number of cpus per pod (default: 1)
+numGpusPerPod: 8              # requested number of gpus per pod (default: 0)
+totalMemoryPerPod: 1Gi        # requested amount of memory per pod (default: 1Gi)
+
+priority: default-priority    # default-priority (default), low-priority, or high-priority
+
+# container image for the pods (required)
+containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+
+# setup commands to run in each pod (optional)
+setupCommands:
+- git clone https://github.com/dbarnett/python-helloworld
+- cd python-helloworld
+
+# main program to invoke via torchrun (optional)
+mainProgram: helloworld.py