Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(sdk): Generate external Kubernetes and JobSet models #2466

Merged
merged 8 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ exclude: |
(?x)^(
docs/images/.*|
pkg/client/.*|
sdk/kubeflow/trainer/[^/]*.py|
sdk/kubeflow/trainer/__init__.py|
sdk/kubeflow/trainer/api/__init__.py|
sdk/kubeflow/trainer/models/.*|
sdk/docs/.*
)$
50 changes: 25 additions & 25 deletions api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"metadata": {
"description": "Standard object's metadata.",
"default": {},
"$ref": "#/definitions/v1.ObjectMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta"
},
"spec": {
"description": "Specification of the desired ClusterTrainingRuntime.",
Expand Down Expand Up @@ -55,7 +55,7 @@
"metadata": {
"description": "Standard list metadata.",
"default": {},
"$ref": "#/definitions/v1.ListMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ListMeta"
}
}
},
Expand Down Expand Up @@ -89,7 +89,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvVar"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvVar"
},
"x-kubernetes-list-map-keys": [
"name"
Expand All @@ -101,7 +101,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvFromSource"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvFromSource"
},
"x-kubernetes-list-type": "atomic"
},
Expand All @@ -115,7 +115,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.VolumeMount"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.VolumeMount"
},
"x-kubernetes-list-map-keys": [
"name"
Expand Down Expand Up @@ -144,7 +144,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvVar"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvVar"
},
"x-kubernetes-list-map-keys": [
"name"
Expand All @@ -153,7 +153,7 @@
},
"secretRef": {
"description": "Reference to the secret with credentials to download dataset. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the dataset provider.",
Expand All @@ -170,7 +170,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvVar"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvVar"
},
"x-kubernetes-list-map-keys": [
"name"
Expand All @@ -179,7 +179,7 @@
},
"secretRef": {
"description": "Reference to the secret with credentials to download model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model provider.",
Expand All @@ -194,12 +194,12 @@
"metadata": {
"description": "Metadata for custom JobSet's labels and annotations. JobSet name and namespace is equal to the TrainJob's name and namespace.",
"default": {},
"$ref": "#/definitions/v1.ObjectMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta"
},
"spec": {
"description": "Specification of the desired JobSet which will be created from TrainJob.",
"default": {},
"$ref": "#/definitions/jobset.v1alpha2.JobSetSpec"
"$ref": "https://raw.githubusercontent.com/kubernetes-sigs/jobset/d5c7bcebe739a4577e30944370c2d7a68321a929/hack/python-sdk/swagger.json#/definitions/jobset.v1alpha2.JobSetSpec"
}
}
},
Expand Down Expand Up @@ -330,7 +330,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvVar"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvVar"
},
"x-kubernetes-list-map-keys": [
"name"
Expand All @@ -339,7 +339,7 @@
},
"secretRef": {
"description": "Reference to the secret with credentials to export model. Secret must be created in the TrainJob's namespace.",
"$ref": "#/definitions/v1.LocalObjectReference"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.LocalObjectReference"
},
"storageUri": {
"description": "Storage uri for the model exporter.",
Expand Down Expand Up @@ -424,7 +424,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.Toleration"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.Toleration"
},
"x-kubernetes-list-type": "atomic"
},
Expand All @@ -433,7 +433,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.Volume"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.Volume"
},
"x-kubernetes-list-map-keys": [
"name"
Expand Down Expand Up @@ -496,7 +496,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/k8s.io.api.autoscaling.v2.MetricSpec"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.autoscaling.v2.MetricSpec"
},
"x-kubernetes-list-type": "atomic"
},
Expand All @@ -517,7 +517,7 @@
},
"numProcPerNode": {
"description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.util.intstr.IntOrString"
}
}
},
Expand All @@ -536,7 +536,7 @@
"metadata": {
"description": "Standard object's metadata.",
"default": {},
"$ref": "#/definitions/v1.ObjectMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta"
},
"spec": {
"description": "Specification of the desired TrainJob.",
Expand Down Expand Up @@ -576,7 +576,7 @@
"metadata": {
"description": "Standard list metadata.",
"default": {},
"$ref": "#/definitions/v1.ListMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ListMeta"
}
}
},
Expand Down Expand Up @@ -648,7 +648,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.Condition"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Condition"
},
"x-kubernetes-list-map-keys": [
"type"
Expand Down Expand Up @@ -698,7 +698,7 @@
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/v1.EnvVar"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.EnvVar"
},
"x-kubernetes-list-map-keys": [
"name"
Expand All @@ -716,11 +716,11 @@
},
"numProcPerNode": {
"description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.",
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.util.intstr.IntOrString"
},
"resourcesPerNode": {
"description": "Compute resources for each training node.",
"$ref": "#/definitions/v1.ResourceRequirements"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.api.core.v1.ResourceRequirements"
}
}
},
Expand All @@ -739,7 +739,7 @@
"metadata": {
"description": "Standard object's metadata.",
"default": {},
"$ref": "#/definitions/v1.ObjectMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta"
},
"spec": {
"description": "Specification of the desired TrainingRuntime.",
Expand Down Expand Up @@ -774,7 +774,7 @@
"metadata": {
"description": "Standard list metadata.",
"default": {},
"$ref": "#/definitions/v1.ListMeta"
"$ref": "https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/v1.32.2/api/openapi-spec/swagger.json#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ListMeta"
}
}
},
Expand Down
9 changes: 8 additions & 1 deletion hack/e2e-setup-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,14 @@ cd manifests/overlays/manager
kustomize edit set image kubeflow/trainer-controller-manager=${CONTROLLER_MANAGER_CI_IMAGE}

echo "Create Kind cluster and load Kubeflow Trainer images"
${KIND} create cluster --image "${KIND_NODE_VERSION}"
cat <<EOF | ${KIND} create cluster --image "${KIND_NODE_VERSION}" --config=-
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this config so that we can add more Kind workers in the future to reduce the time kube-scheduler takes to place Pods.
I think that should speed up our tests once we run more TrainJobs at the same time.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you open a separate PR? This does not seem to be related to openapi-gen.
If we find any issue with the added Kind config, a dedicated PR allows us to easily revert and fix it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, let me open it.

kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
EOF

${KIND} load docker-image ${CONTROLLER_MANAGER_CI_IMAGE}

echo "Deploy Kubeflow Trainer control plane"
Expand Down
53 changes: 25 additions & 28 deletions hack/python-sdk/gen-sdk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,41 +21,30 @@ set -o nounset

# TODO (andreyvelich): Read this data from the global VERSION file.
SDK_VERSION="0.1.0"

SDK_OUTPUT_PATH="sdk"

SWAGGER_JAR_URL="https://repo1.maven.org/maven2/org/openapitools/openapi-generator-cli/4.3.1/openapi-generator-cli-4.3.1.jar"
SWAGGER_CODEGEN_JAR="hack/python-sdk/openapi-generator-cli.jar"
OPENAPI_GENERATOR_VERSION="v7.11.0"
TRAINER_ROOT="$(pwd)"
SWAGGER_CODEGEN_CONF="hack/python-sdk/swagger_config.json"
SWAGGER_CODEGEN_FILE="api/openapi-spec/swagger.json"

if [[ ! -f "$SWAGGER_CODEGEN_JAR" ]]; then
echo "Downloading the openapi-generator-cli JAR package to generate SDK"
wget -O "${SWAGGER_CODEGEN_JAR}" ${SWAGGER_JAR_URL}
fi

echo "Generating Python SDK for Kubeflow Trainer V2 ..."
java -jar "${SWAGGER_CODEGEN_JAR}" generate -i "${SWAGGER_CODEGEN_FILE}" -g python \
-o "${SDK_OUTPUT_PATH}" \
-c "${SWAGGER_CODEGEN_CONF}" \
# We need to set the user to allow the container to overwrite existing files.
docker run --user "$(id -u)":"$(id -g)" --rm \
-v "${TRAINER_ROOT}:/local" docker.io/openapitools/openapi-generator-cli:${OPENAPI_GENERATOR_VERSION} generate \
-g python \
-i "local/${SWAGGER_CODEGEN_FILE}" \
-c "local/${SWAGGER_CODEGEN_CONF}" \
-o "local/${SDK_OUTPUT_PATH}" \
-p=packageVersion="${SDK_VERSION}" \
--global-property apiTests=false,modelTests=false # TODO (andreyvelich): Discuss if we should use these test files.
--global-property models,modelTests=false,modelDocs=false,supportingFiles=__init__.py

echo "Removing unused files for the Python SDK"
git clean -f ${SDK_OUTPUT_PATH}/.openapi-generator
git clean -f ${SDK_OUTPUT_PATH}/.gitignore
git clean -f ${SDK_OUTPUT_PATH}/.gitlab-ci.yml
git clean -f ${SDK_OUTPUT_PATH}/git_push.sh
git clean -f ${SDK_OUTPUT_PATH}/.openapi-generator-ignore
git clean -f ${SDK_OUTPUT_PATH}/.travis.yml
git clean -f ${SDK_OUTPUT_PATH}/requirements.txt
git clean -f ${SDK_OUTPUT_PATH}/setup.cfg
git clean -f ${SDK_OUTPUT_PATH}/setup.py
git clean -f ${SDK_OUTPUT_PATH}/test-requirements.txt
git clean -f ${SDK_OUTPUT_PATH}/tox.ini
git clean -f ${SDK_OUTPUT_PATH}/.github
git clean -f ${SDK_OUTPUT_PATH}/test

# Revert the README since it is manually created.
git checkout ${SDK_OUTPUT_PATH}/README.md
# Revert manually created files.
git checkout ${SDK_OUTPUT_PATH}/kubeflow/trainer/__init__.py

# Manually modify the SDK version in the __init__.py file.
Expand All @@ -65,7 +54,15 @@ else
sed -i -e "s/__version__.*/__version__ = \"${SDK_VERSION}\"/" ${SDK_OUTPUT_PATH}/kubeflow/trainer/__init__.py
fi

# Kubeflow models must have Kubernetes models to perform serialization.
printf "\n# Import Kubernetes and JobSet models for the serialization. \n" >>${SDK_OUTPUT_PATH}/kubeflow/trainer/models/__init__.py
printf "from kubernetes.client import *\n" >>${SDK_OUTPUT_PATH}/kubeflow/trainer/models/__init__.py
printf "from jobset.models import *\n" >>${SDK_OUTPUT_PATH}/kubeflow/trainer/models/__init__.py
# The `model_config` property conflicts with Pydantic name.
# Therefore, we rename it to `model_config_crd`
TRAINJOB_SPEC_MODEL=${SDK_OUTPUT_PATH}/kubeflow/trainer/models/trainer_v1alpha1_train_job_spec.py
if [[ $(uname) == "Darwin" ]]; then
sed -i '' -e "s/model_config/model_config_crd/" ${TRAINJOB_SPEC_MODEL}
sed -i '' -e "s/model_config_crd = ConfigDict/model_config = ConfigDict/" ${TRAINJOB_SPEC_MODEL}
sed -i '' -e "s/kubeflow.trainer.models.trainer_v1alpha1_model_config_crd/kubeflow.trainer.models.trainer_v1alpha1_model_config/" ${TRAINJOB_SPEC_MODEL}
else
sed -i -e "s/model_config/model_config_crd/" ${TRAINJOB_SPEC_MODEL}
sed -i -e "s/model_config_crd = ConfigDict/model_config = ConfigDict/" ${TRAINJOB_SPEC_MODEL}
sed -i -e "s/kubeflow.trainer.models.trainer_v1alpha1_model_config_crd/kubeflow.trainer.models.trainer_v1alpha1_model_config/" ${TRAINJOB_SPEC_MODEL}
fi
6 changes: 1 addition & 5 deletions hack/python-sdk/swagger_config.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
{
"packageName": "kubeflow.trainer",
"typeMappings": {
"K8sIoApiAutoscalingV2MetricSpec": "V2MetricSpec",
"K8sIoApimachineryPkgUtilIntstrIntOrString": "object",
"V1Time": "datetime"
}
"typeMappings": {}
}
38 changes: 35 additions & 3 deletions hack/swagger/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"encoding/json"
"fmt"
"runtime/debug"
"strings"

"k8s.io/klog/v2"
Expand All @@ -30,12 +31,44 @@ import (

// Generate Kubeflow Training OpenAPI specification.
func main() {
// Get Kubernetes and JobSet version
var k8sVersion string
var jobSetVersion string

info, ok := debug.ReadBuildInfo()
if !ok {
fmt.Println("Failed to read build info")
return
}

for _, dep := range info.Deps {
if dep.Path == "k8s.io/api" {
k8sVersion = strings.Replace(dep.Version, "v0.", "v1.", -1)
} else if dep.Path == "sigs.k8s.io/jobset" {
jobSetVersion = dep.Version
}
Comment on lines +44 to +49
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this look like a good way to fetch the Kubernetes and JobSet versions?

}
if k8sVersion == "" || jobSetVersion == "" {
fmt.Println("OpenAPI spec generation failed. Unable to get Kubernetes and JobSet version")
return
}

k8sOpenAPISpec := fmt.Sprintf("https://raw.githubusercontent.com/kubernetes/kubernetes/refs/tags/%s/api/openapi-spec/swagger.json", k8sVersion)
// TODO (andreyvelich): Use the release version once this JobSet commit is released: d5c7bce.
// jobSetOpenAPISpec := fmt.Sprintf("https://raw.githubusercontent.com/kubernetes-sigs/jobset/refs/tags/%s/hack/python-sdk/swagger.json", jobSetVersion)
jobSetOpenAPISpec := "https://raw.githubusercontent.com/kubernetes-sigs/jobset/d5c7bcebe739a4577e30944370c2d7a68321a929/hack/python-sdk/swagger.json"

var oAPIDefs = map[string]common.OpenAPIDefinition{}
defs := spec.Definitions{}

refCallback := func(name string) spec.Ref {
return spec.MustCreateRef("#/definitions/" + common.EscapeJsonPointer(swaggify(name)))
if strings.HasPrefix(name, "k8s.io") {
return spec.MustCreateRef(k8sOpenAPISpec + "#/definitions/" + swaggify(name))
} else if strings.HasPrefix(name, "sigs.k8s.io/jobset") {
return spec.MustCreateRef(jobSetOpenAPISpec + "#/definitions/" + swaggify(name))
}
return spec.MustCreateRef("#/definitions/" + swaggify(name))

}

for k, v := range trainer.GetOpenAPIDefinitions(refCallback) {
Expand Down Expand Up @@ -67,8 +100,7 @@ func main() {
// swaggify converts a Go package-qualified type name into the definition name
// used in the generated OpenAPI spec.
//
// Kubeflow Trainer and JobSet API path prefixes are dropped, so those types
// become "<group>.<version>.<Kind>" (for example "trainer.v1alpha1.TrainJob"
// or "jobset.v1alpha2.JobSetSpec"). Kubernetes types keep their full package
// path with the "k8s.io" host rewritten to "io.k8s" (for example
// "io.k8s.api.core.v1.EnvVar"), which is the form used by the external
// Kubernetes swagger.json references in this spec.
func swaggify(name string) string {
	name = strings.ReplaceAll(name, "github.com/kubeflow/trainer/pkg/apis/", "")
	// The JobSet prefix must be stripped before "k8s.io" is rewritten below,
	// because "sigs.k8s.io" itself contains "k8s.io".
	name = strings.ReplaceAll(name, "sigs.k8s.io/jobset/api/", "")
	// Do not strip "k8s.io/api/core/" or "k8s.io/apimachinery/pkg/apis/meta/":
	// shortening those names to "v1.<Kind>" would make the external
	// Kubernetes $refs point at definitions that do not exist upstream.
	name = strings.ReplaceAll(name, "k8s.io", "io.k8s")
	return strings.ReplaceAll(name, "/", ".")
}
Loading
Loading