Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix nightly tests #402

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions .github/workflows/nightly_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ on:
env:
# Names must be unique in parallel running tests.
EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v5p-8-nodepools
TPU_CLUSTER_NAME: nightly-xpk-2-v5p-8-nodepools
PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8-nodepools
TPU_CLUSTER_NAME: nightly-xpk-2-v4-8-nodepools
WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-v5p-8-nodepools
PATHWAYS_TPU_CLUSTER_NAME: pw-nightly-test-2-v4-8-nodepools
PATHWAYS_WORKLOAD_NAME: xpkpw-nightly-${{ github.run_attempt }}
CLUSTER_NETWORK_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}}"
RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-v5p-8-nodepools
RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-v4-8-nodepools

jobs:
cluster-create-and-delete:
Expand Down Expand Up @@ -59,28 +59,32 @@ jobs:
run: |
gcloud config set compute/zone us-east4-a
gcloud config get compute/zone
- name: Install xpk dependencies
run: |
make install
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create an XPK Cluster with zero node pools
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Delete the cluster created
run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
if: always()
- name: Create a Private XPK Cluster with zero node pools
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v5p-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${CLUSTER_NETWORK_ARGUMENTS}'
- name: Verify the created cluster is private
run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
- name: Delete the cluster created
run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=us-central2-b --force
if: always()
- name: Create an XPK Cluster with 2x v5p-8 nodepools
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Create an XPK Cluster with 2x v4-8 nodepools
run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a base-docker-image workload
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
- name: List out the workloads on the cluster
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Run xpk inspector with the workload created above
Expand Down Expand Up @@ -157,12 +161,12 @@ jobs:
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create an Pathways-enabled XPK Cluster with 2 x v5p-8 nodepools
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
- name: Create an Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
run: python xpk.py cluster create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
- name: Create test script to execute in workloads
run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
- name: Run a Pathways workload on Ubuntu base image
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
run: python xpk.py workload create-pathways --cluster $PATHWAYS_TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
- name: Wait for Pathways workload completion and confirm it succeeded
run: python3 xpk.py workload list --cluster $PATHWAYS_TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
- name: Delete the Pathways workload on the cluster
Expand Down Expand Up @@ -198,8 +202,8 @@ jobs:
echo $PWD/bin >> "$GITHUB_PATH"
- name: Check xpk installation
run: xpk --help
- name: Create a RayCluster-enabled XPK Cluster with 2 x v5p-8 nodepools
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v5p-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
- name: Create a RayCluster-enabled XPK Cluster with 2 x v4-8 nodepools
run: python xpk.py cluster create-ray --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS}}'
- name: Delete the RayCluster-enabled XPK cluster
if: always()
run: python xpk.py cluster delete --cluster $RAYCLUSTER_TPU_CLUSTER_NAME --zone=us-central2-b
Expand Down
Loading