diff --git a/sdk_v2/kubeflow/training/api/training_client.py b/sdk_v2/kubeflow/training/api/training_client.py index 05f3f69078..ce0b6a4b1e 100644 --- a/sdk_v2/kubeflow/training/api/training_client.py +++ b/sdk_v2/kubeflow/training/api/training_client.py @@ -100,7 +100,7 @@ def list_runtimes(self) -> List[types.Runtime]: # For that, we need to import the JobSet models. response = thread.get(constants.DEFAULT_TIMEOUT) for item in response["items"]: - # TODO (andreyvelich): Currently, the training phase label must be presented. + # TODO (andreyvelich): Currently, the labels must be present. if "labels" in item["metadata"]: # Get the Trainer container resources. resources = None @@ -123,16 +123,24 @@ def list_runtimes(self) -> List[types.Runtime]: # TODO (andreyvelich): Currently, we get the device type from # the runtime labels. _, device_count = utils.get_container_devices(resources, num_procs) - if device_count != constants.UNKNOWN_DEVICE: + if device_count != constants.UNKNOWN: device_count = str( int(device_count) * int(item["spec"]["mlPolicy"]["numNodes"]) ) runtime = types.Runtime( - name=item["metadata"]["name"], # type: ignore - phase=item["metadata"]["labels"][constants.PHASE_KEY], # type: ignore - device=item["metadata"]["labels"][constants.DEVICE_KEY], # type: ignore + name=item["metadata"]["name"], + phase=( + item["metadata"]["labels"][constants.PHASE_KEY] + if constants.PHASE_KEY in item["metadata"]["labels"] + else constants.UNKNOWN + ), + device=( + item["metadata"]["labels"][constants.DEVICE_KEY] + if constants.DEVICE_KEY in item["metadata"]["labels"] + else constants.UNKNOWN + ), device_count=device_count, ) @@ -501,7 +509,7 @@ def __get_trainjob_from_crd( container.resources ) # If resources are not set in containers, we can't get the device. 
- if device == constants.UNKNOWN_DEVICE: + if device == constants.UNKNOWN: device_count = device break device_count = str(int(device_count) + int(dc)) diff --git a/sdk_v2/kubeflow/training/constants/constants.py b/sdk_v2/kubeflow/training/constants/constants.py index 7ff2652251..08f668724a 100644 --- a/sdk_v2/kubeflow/training/constants/constants.py +++ b/sdk_v2/kubeflow/training/constants/constants.py @@ -45,8 +45,8 @@ # TODO: Potentially, we should get this data from the Node selectors. DEVICE_KEY = "training.kubeflow.org/device" -# This values indicates that device or number of devices are unknown for the container. -UNKNOWN_DEVICE = "Unknown" +# Unknown indicates that the value can't be identified. +UNKNOWN = "Unknown" # The label for CPU device in the container resources. CPU_DEVICE_LABEL = "cpu" diff --git a/sdk_v2/kubeflow/training/utils/utils.py b/sdk_v2/kubeflow/training/utils/utils.py index 794afef680..9c75d12af6 100644 --- a/sdk_v2/kubeflow/training/utils/utils.py +++ b/sdk_v2/kubeflow/training/utils/utils.py @@ -61,8 +61,8 @@ def get_container_devices( # TODO (andreyvelich): We should discuss how to get container device type. # Potentially, we can use the training.kubeflow.org/device label from the runtime or # node types. - device = constants.UNKNOWN_DEVICE - device_count = constants.UNKNOWN_DEVICE + device = constants.UNKNOWN + device_count = constants.UNKNOWN # If containers resource limits are empty, return Unknown. if resources is None or resources.limits is None: