Skip to content

Commit

Permalink
Fix nightly tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
mbobrovskyi committed Mar 7, 2025
1 parent 05efeb5 commit 38e3a2d
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ on:
- v4-8
push:
branches: ["main","develop"]
pull_request: # By default this runs for types assigned, opened and synchronize.
# pull_request: # By default this runs for types assigned, opened and synchronize.

jobs:
set-variables:
Expand Down
32 changes: 18 additions & 14 deletions .github/workflows/nightly_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ on:
env:
# Names must be unique in parallel running tests.
EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v5p-8-nodepools
TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8-nodepools
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8-nodepools
TPU_CLUSTER_NAME: nightly-xpk-2-v4-8-nodepools
WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-v5p-8-nodepools
PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-v4-8-nodepools
PATHWAYS_WORKLOAD_NAME: xpkpw-nightly-${{ github.run_attempt }}
CLUSTER_NETWORK_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}}"
RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-v5p-8-nodepools
RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-v4-8-nodepools

jobs:
cluster-create-and-delete:
Expand Down Expand Up @@ -59,28 +59,32 @@ jobs:
run: |
gcloud config set compute/zone us-east4-a
gcloud config get compute/zone
- name: Install xpk dependencies
run: |
make install
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create an XPK Cluster with zero node pools
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Delete the cluster created
run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
if: always()
- name: Create a Private XPK Cluster with zero node pools
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
# Double quotes let the step's shell expand CLUSTER_NETWORK_ARGUMENTS (set in the workflow-level env block); single quotes would pass the literal text '${CLUSTER_NETWORK_ARGUMENTS}'. This matches the quoting used by the Pathways cluster-create step.
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
- name: Verify the created cluster is private
run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Delete the cluster created
run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=us-central2-b --force
if: always()
- name: Create an XPK Cluster with 2x v5p-8 nodepools
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Create an XPK Cluster with 2x v4-8 nodepools
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Run xpk inspector with the workload created above
Expand Down Expand Up @@ -157,12 +161,12 @@ jobs:
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create an Pathways-enabled XPK Cluster with 2 x v5p-8 nodepools
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
- name: Create an Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: Delete the Pathways workload on the cluster
Expand Down Expand Up @@ -198,8 +202,8 @@ jobs:
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create a RayCluster-enabled XPK Cluster with 2 x v5p-8 nodepools
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
- name: Create a RayCluster-enabled XPK Cluster with 2 x v4-8 nodepools
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
- name: Delete the RayCluster-enabled XPK cluster
if: always()
run: python xpk.py cluster delete --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --zone=us-central2-b
Expand Down

0 comments on commit 38e3a2d

Please sign in to comment.