From f191a758990a53a023ee3eedf114314e4e6222a5 Mon Sep 17 00:00:00 2001 From: Rueian Date: Sun, 19 Jan 2025 15:15:29 -0800 Subject: [PATCH] [RayJob] RayJob deletion policy validation (#2771) Signed-off-by: Rueian --- .../controllers/ray/rayjob_controller.go | 16 ++++++++ .../ray/rayjob_controller_unit_test.go | 38 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 3e53f55a63f..4e54f7d2a73 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -902,6 +902,22 @@ func validateRayJobSpec(rayJob *rayv1.RayJob) error { if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { return fmt.Errorf("backoffLimit must be a positive integer") } + if !features.Enabled(features.RayJobDeletionPolicy) && rayJob.Spec.DeletionPolicy != nil { + return fmt.Errorf("RayJobDeletionPolicy feature gate must be enabled to use the DeletionPolicy feature") + } + if rayJob.Spec.ClusterSelector != nil && + rayJob.Spec.DeletionPolicy != nil && *rayJob.Spec.DeletionPolicy == rayv1.DeleteClusterDeletionPolicy { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionPolicy=DeleteCluster") + } + if rayJob.Spec.ClusterSelector != nil && + rayJob.Spec.DeletionPolicy != nil && *rayJob.Spec.DeletionPolicy == rayv1.DeleteWorkersDeletionPolicy { + return fmt.Errorf("the ClusterSelector mode doesn't support DeletionPolicy=DeleteWorkers") + } + if rayJob.Spec.DeletionPolicy != nil && *rayJob.Spec.DeletionPolicy == rayv1.DeleteWorkersDeletionPolicy && + rayJob.Spec.RayClusterSpec.EnableInTreeAutoscaling != nil && *rayJob.Spec.RayClusterSpec.EnableInTreeAutoscaling { + // TODO (rueian): This can be supported in future Ray. We should check the RayVersion once we know the version. + return fmt.Errorf("DeletionPolicy=DeleteWorkers currently does not support RayClusterSpec.EnableInTreeAutoscaling") + } if rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.DeletionPolicy != nil && *rayJob.Spec.DeletionPolicy == rayv1.DeleteNoneDeletionPolicy { return fmt.Errorf("shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") } diff --git a/ray-operator/controllers/ray/rayjob_controller_unit_test.go b/ray-operator/controllers/ray/rayjob_controller_unit_test.go index c4993a78937..115896df13f 100644 --- a/ray-operator/controllers/ray/rayjob_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_unit_test.go @@ -21,6 +21,7 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" utils "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) func TestCreateRayJobSubmitterIfNeed(t *testing.T) { @@ -367,6 +368,43 @@ func TestValidateRayJobSpec(t *testing.T) { }) assert.ErrorContains(t, err, "backoffLimit must be a positive integer") + err = validateRayJobSpec(&rayv1.RayJob{ + Spec: rayv1.RayJobSpec{ + DeletionPolicy: ptr.To(rayv1.DeleteClusterDeletionPolicy), + ShutdownAfterJobFinishes: true, + RayClusterSpec: &rayv1.RayClusterSpec{}, + }, + }) + assert.ErrorContains(t, err, "RayJobDeletionPolicy feature gate must be enabled to use the DeletionPolicy feature") + + defer features.SetFeatureGateDuringTest(t, features.RayJobDeletionPolicy, true)() + + err = validateRayJobSpec(&rayv1.RayJob{ + Spec: rayv1.RayJobSpec{ + DeletionPolicy: ptr.To(rayv1.DeleteClusterDeletionPolicy), + ClusterSelector: map[string]string{"key": "value"}, + }, + }) + assert.ErrorContains(t, err, "the ClusterSelector mode doesn't support DeletionPolicy=DeleteCluster") + + err = validateRayJobSpec(&rayv1.RayJob{ + Spec: rayv1.RayJobSpec{ + DeletionPolicy: ptr.To(rayv1.DeleteWorkersDeletionPolicy), + ClusterSelector: map[string]string{"key": "value"}, + }, + }) + assert.ErrorContains(t, err, "the ClusterSelector mode doesn't support DeletionPolicy=DeleteWorkers") + + err = validateRayJobSpec(&rayv1.RayJob{ + Spec: rayv1.RayJobSpec{ + DeletionPolicy: ptr.To(rayv1.DeleteWorkersDeletionPolicy), + RayClusterSpec: &rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To[bool](true), + }, + }, + }) + assert.ErrorContains(t, err, "DeletionPolicy=DeleteWorkers currently does not support RayClusterSpec.EnableInTreeAutoscaling") + err = validateRayJobSpec(&rayv1.RayJob{ Spec: rayv1.RayJobSpec{ DeletionPolicy: ptr.To(rayv1.DeleteClusterDeletionPolicy),