From 378205176e80edfa73ff6b1f9b952c497445bb41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Wed, 19 Feb 2025 15:21:10 +0000 Subject: [PATCH 1/3] add draft for gcs state bucket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/commands/cluster_gcluster.py | 19 +++++++++++++++++++ src/xpk/parser/cluster.py | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/xpk/commands/cluster_gcluster.py b/src/xpk/commands/cluster_gcluster.py index 908c6a58..753ce188 100644 --- a/src/xpk/commands/cluster_gcluster.py +++ b/src/xpk/commands/cluster_gcluster.py @@ -14,6 +14,7 @@ limitations under the License. """ +from ..core.commands import run_command_for_value from ..core.blueprint.blueprint_generator import BlueprintGenerator, BlueprintGeneratorOutput, supported_device_types, a3mega_device_type, a3ultra_device_type from ..core.docker_manager import DockerManager from ..core.gcluster_manager import GclusterManager @@ -144,6 +145,19 @@ def prepare_blueprint_generator() -> BlueprintGenerator: return BlueprintGenerator(storage_path=blueprints_path) +def validate_state_gcs_bucket(args): + bucket_validate_cmd = ( + f'gcloud storage buckets describe gs://{args.cluster_state_gcs_bucket}' + ) + err_code, _ = run_command_for_value( + bucket_validate_cmd, + 'Validate remote state bucket existence.', + global_args=args, + ) + if err_code != 0: + xpk_exit(err_code) + + def generate_blueprint( blueprint_name, args, prefix=None ) -> BlueprintGeneratorOutput: @@ -154,6 +168,9 @@ def generate_blueprint( bpg = prepare_blueprint_generator() + if args.cluster_state_gcs_bucket is not None: + validate_state_gcs_bucket(args) + if args.device_type in supported_device_types: if args.device_type == a3mega_device_type: num_nodes = args.num_nodes if not args.num_nodes is None else 2 @@ -170,6 +187,7 @@ def generate_blueprint( capacity_type=capacity_type, 
system_node_pool_machine_type=args.default_pool_cpu_machine_type, system_node_pool_min_node_count=args.default_pool_cpu_num_nodes, + gcs_bucket=args.cluster_state_gcs_bucket, ) if args.device_type == a3ultra_device_type: num_nodes = args.num_nodes if not args.num_nodes is None else 2 @@ -186,5 +204,6 @@ def generate_blueprint( capacity_type=capacity_type, system_node_pool_machine_type=args.default_pool_cpu_machine_type, system_node_pool_min_node_count=args.default_pool_cpu_num_nodes, + gcs_bucket=args.cluster_state_gcs_bucket, ) return None diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py index 6aa15d12..4c723651 100644 --- a/src/xpk/parser/cluster.py +++ b/src/xpk/parser/cluster.py @@ -26,6 +26,8 @@ from ..core.core import DEFAULT_VERTEX_TENSORBOARD_NAME from .common import add_shared_arguments from .validators import name_type +from ..commands.config import xpk_cfg +from ..core.config import CFG_BUCKET_KEY def set_cluster_parser(cluster_parser): @@ -83,6 +85,13 @@ def set_cluster_parser(cluster_parser): ) ### Optional arguments specific to "cluster create" + cluster_create_optional_arguments.add_argument( + '--cluster-state-gcs-bucket', + type=str, + default=xpk_cfg.get(CFG_BUCKET_KEY), + help='The name of the bucket to store cluster state.', + required=False, + ) cluster_create_optional_arguments.add_argument( '--num-nodes', type=int, @@ -277,6 +286,13 @@ def set_cluster_parser(cluster_parser): ) ### Optional Arguments + cluster_delete_optional_arguments.add_argument( + '--cluster-state-gcs-bucket', + type=str, + default=xpk_cfg.get(CFG_BUCKET_KEY), + help='The name of the bucket to store cluster state.', + required=False, + ) add_shared_arguments(cluster_delete_optional_arguments) cluster_delete_parser.set_defaults(func=cluster_delete) cluster_delete_parser.add_argument( From f5bf8051f557d10f465290b70229e86aae567f3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Thu, 27 Feb 2025 14:12:03 +0000 Subject: [PATCH 2/3] fix 
builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- .github/workflows/build_tests.yaml | 6 +++--- .github/workflows/cluster_create.yaml | 2 +- .github/workflows/cluster_private.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index a02079e0..ac7ca593 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -67,15 +67,15 @@ jobs: - name: set zone id: set-zone run: | - echo zone=europe-west4-b >> $GITHUB_OUTPUT + echo zone=us-central2-b >> $GITHUB_OUTPUT - name: set tpu-type id: set-tpu-type run: | - echo tpu-type=v5p-8 >> $GITHUB_OUTPUT + echo tpu-type=v4-8 >> $GITHUB_OUTPUT - name: set location id: set-location run: | - echo location=europe-west4 >> $GITHUB_OUTPUT + echo location=us-central2 >> $GITHUB_OUTPUT install-dependencies: needs: [set-variables] runs-on: ubuntu-22.04 diff --git a/.github/workflows/cluster_create.yaml b/.github/workflows/cluster_create.yaml index 23571e69..9203988c 100644 --- a/.github/workflows/cluster_create.yaml +++ b/.github/workflows/cluster_create.yaml @@ -79,4 +79,4 @@ jobs: - name: Check xpk installation run: xpk --help - name: Create a Pathways-enabled XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing. 
- run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \ No newline at end of file + run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \ No newline at end of file diff --git a/.github/workflows/cluster_private.yaml b/.github/workflows/cluster_private.yaml index 677f8411..06ac69a2 100644 --- a/.github/workflows/cluster_private.yaml +++ b/.github/workflows/cluster_private.yaml @@ -78,7 +78,7 @@ jobs: - name: Check xpk installation run: xpk --help - name: Create a Pathways-enabled private XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing. 
- run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V5_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" + run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" - name: Verify the created cluster is private run: gcloud container clusters describe ${{inputs.cluster-name}}-private --location=${{inputs.location}} --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1) - name: Delete the cluster created From e9ec586990af056d8d783793e2f5de7b081b0938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Thu, 27 Feb 2025 14:12:59 +0000 Subject: [PATCH 3/3] fix choices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- .github/workflows/build_tests.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index ac7ca593..ac398d8b 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -20,11 +20,10 @@ on: tpu-type: description: 'TPU Type' required: true - default: 'v5p-8' + default: 'v4-8' type: choice options: - - v5p-8 - - v5litepod-8 + - v4-8 push: branches: ["main","develop"] pull_request: # By default this runs for types assigned, opened and synchronize.