Skip to content

Commit

Permalink
Add the option to use Multi-tier checkpointing in workloads
Browse files (browse the repository at this point in the history)
  • Loading branch information
abhinavclemson committed Feb 27, 2025
1 parent c8f2095 commit 01b0557
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/xpk/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2109,10 +2109,14 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
name: dshm-2"""

if args.ramdisk_directory != '':
volumes += """
driver = 'phase1-checkpoint.csi.storage.gke.io'
if args.mtc_enabled:
driver = 'multitier-checkpoint.csi.storage.gke.io'

volumes += f"""
- name: cache
csi:
driver: phase1-checkpoint.csi.storage.gke.io"""
driver: {driver}"""

if (
system.accelerator_type == AcceleratorType['TPU']
Expand Down
9 changes: 9 additions & 0 deletions src/xpk/parser/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,15 @@ def add_shared_workload_create_optional_arguments(args_parsers):
' be used with the CSI driver provided by GKE.'
),
)
custom_parser.add_argument(
'--mtc_enabled',
action='store_true',
help=(
'The workload can use multi-tier checkpointing controllers when the'
' --ramdisk-directory argument is used with this additional'
' argument.'
),
)


def add_shared_workload_create_env_arguments(args_parsers):
Expand Down

0 comments on commit 01b0557

Please sign in to comment.