Skip to content

Commit

Permalink
Check if runtime label is present
Browse files Browse the repository at this point in the history
Signed-off-by: Andrey Velichkevich <[email protected]>
  • Loading branch information
andreyvelich committed Nov 21, 2024
1 parent 2abf580 commit 4fd46c5
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 10 deletions.
20 changes: 14 additions & 6 deletions sdk_v2/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def list_runtimes(self) -> List[types.Runtime]:
# For that, we need to import the JobSet models.
response = thread.get(constants.DEFAULT_TIMEOUT)
for item in response["items"]:
# TODO (andreyvelich): Currently, the training phase label must be present.
# TODO (andreyvelich): Currently, the labels must be present.
if "labels" in item["metadata"]:
# Get the Trainer container resources.
resources = None
Expand All @@ -123,16 +123,24 @@ def list_runtimes(self) -> List[types.Runtime]:
# TODO (andreyvelich): Currently, we get the device type from
# the runtime labels.
_, device_count = utils.get_container_devices(resources, num_procs)
if device_count != constants.UNKNOWN_DEVICE:
if device_count != constants.UNKNOWN:
device_count = str(
int(device_count)
* int(item["spec"]["mlPolicy"]["numNodes"])
)

runtime = types.Runtime(
name=item["metadata"]["name"], # type: ignore
phase=item["metadata"]["labels"][constants.PHASE_KEY], # type: ignore
device=item["metadata"]["labels"][constants.DEVICE_KEY], # type: ignore
name=item["metadata"]["name"],
phase=(
item["metadata"]["labels"][constants.PHASE_KEY]
if constants.PHASE_KEY in item["metadata"]["labels"]
else constants.UNKNOWN
),
device=(
item["metadata"]["labels"][constants.DEVICE_KEY]
if constants.DEVICE_KEY in item["metadata"]["labels"]
else constants.UNKNOWN
),
device_count=device_count,
)

Expand Down Expand Up @@ -501,7 +509,7 @@ def __get_trainjob_from_crd(
container.resources
)
# If resources are not set in containers, we can't get the device.
if device == constants.UNKNOWN_DEVICE:
if device == constants.UNKNOWN:
device_count = device
break
device_count = str(int(device_count) + int(dc))
Expand Down
4 changes: 2 additions & 2 deletions sdk_v2/kubeflow/training/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
# TODO: Potentially, we should get this data from the Node selectors.
DEVICE_KEY = "training.kubeflow.org/device"

# This value indicates that the device or the number of devices is unknown for the container.
UNKNOWN_DEVICE = "Unknown"
# Unknown indicates that the value can't be identified.
UNKNOWN = "Unknown"

# The label for CPU device in the container resources.
CPU_DEVICE_LABEL = "cpu"
Expand Down
4 changes: 2 additions & 2 deletions sdk_v2/kubeflow/training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ def get_container_devices(
# TODO (andreyvelich): We should discuss how to get container device type.
# Potentially, we can use the training.kubeflow.org/device label from the runtime or
# node types.
device = constants.UNKNOWN_DEVICE
device_count = constants.UNKNOWN_DEVICE
device = constants.UNKNOWN
device_count = constants.UNKNOWN

# If the container's resource limits are empty, return Unknown.
if resources is None or resources.limits is None:
Expand Down

0 comments on commit 4fd46c5

Please sign in to comment.