Port infra from tpu-pytorch to tpu-pytorch-releases and support v4, v…

…5p, v6e CI run
pytorch · Jan 15, 2025 · 8940757 · 8940757
1 parent 4edbf61
commit 8940757
Show file tree

Hide file tree

Showing 24 changed files with 552 additions and 99 deletions.
diff --git a/.github/ci.md b/.github/ci.md
@@ -57,7 +57,7 @@ For the C++ test groups in either case, the test binaries are pre-built during t
 
 The TPU CI runs only a subset of our tests due to capacity constraints, defined in `_tpu_ci.yml` `test/tpu/run_tests.sh`. The runners themselves are containers in GKE managed by [ARC](https://github.com/actions/actions-runner-controller). The container image is also based on our dev images, with some changes for ARC compatibility. The Dockerfile for this image lives in `test/tpu/Dockerfile`.
 
-The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch/tpu_ci.yml`.
+The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch-releases/tpu_ci.yml`.
 
 ### Reproducing test failures
 
@@ -95,7 +95,7 @@ If the TPU CI won't run, try to debug using the following steps:
 On your cloudtop:
 
 ```
-gcloud config set project tpu-pytorch
+gcloud config set project tpu-pytorch-releases
 gcloud container clusters get-credentials tpu-ci --location=us-central2
 ```
 

diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
@@ -1,36 +1,48 @@
 name: TPU Integration Test
 on:
   workflow_call:
+    inputs:
+      tpu-version:
+        required: true
+        type: string
+      runner-label:
+        required: true
+        type: string
+
 jobs:
   tpu-test:
-    runs-on: v4-runner-set
+    # Use dynamic runner based on TPU version
+    runs-on: ${{ inputs.runner-label }}-runner-set
     steps:
       - name: Checkout actions
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
+
       - name: Setup
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
           wheels-artifact: torch-xla-wheels
+
       - name: Install test dependencies
         shell: bash
         run: |
-          # TODO: Add these in setup.py
           pip install --upgrade pip
           pip install fsspec
           pip install rich
-          # Jax nightly is needed for pallas tests.
+          # Jax nightly is needed for pallas tests
           pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
           pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html
           pip install --upgrade protobuf
+
       - name: Run Tests
         env:
           PJRT_DEVICE: TPU
           TPU_LOG_DIR: tpu_logs
+          TPU_VERSION: ${{ inputs.tpu-version }}
         run: |
           cd pytorch/xla
           test/tpu/run_tests.sh
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -104,6 +104,13 @@ jobs:
     uses: ./.github/workflows/_tpu_ci.yml
     needs: build-torch-xla
     if: github.event_name == 'push' || github.event_name == 'pull_request'
+    strategy:
+      matrix:
+        tpu-version: ['v4', 'v5p', 'v6e']
+      fail-fast: false  # Continue running other TPU versions if one fails
+    with:
+      tpu-version: ${{ matrix.tpu-version }}
+      runner-label: ${{ format('tpu-{0}', matrix.tpu-version) }}
 
   push-docs:
     name: "Build docs"

diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
@@ -35,15 +35,15 @@ shift $(($OPTIND - 1))
 
 # func for test after ssh to VM, create container and execute in container
 function benchmarking_in_container {
-  sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8
+  sudo docker pull gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8
   sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent    software-properties-common
   nvidia-smi
   distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
   curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
   curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
   sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
   sudo systemctl restart docker
-  sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash
+  sudo docker run --gpus all -it -d gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8 bin/bash
   sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
   # install torchbench
   cd ~

diff --git a/docker/cloudbuild.yaml b/docker/cloudbuild.yaml
@@ -1,5 +1,5 @@
 # Cloud Build Configuration which:
-# (1) Builds, tests, and pushes gcr.io/tpu-pytorch/xla image
+# (1) Builds, tests, and pushes gcr.io/tpu-pytorch-releases/xla image
 # (2) Collects and stores torch and torch_xla wheels
 steps:
 - name: 'gcr.io/cloud-builders/docker'
@@ -16,20 +16,20 @@ steps:
           '--build-arg', 'cuda_compute=${_CUDA_COMPUTE}',
           '--build-arg', 'xla_branch=${_GITHUB_XLA_BRANCH}',
           '--build-arg', 'examle_branch=${_GITHUB_EXAMPLE_BRANCH}',
-          '-t', 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}',
+          '-t', 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}',
           '-f', 'docker/Dockerfile', '.'
         ]
   timeout: 14400s
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: bash
-  args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
-- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
+  args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
+- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
   entrypoint: bash
   args: ['-c', 'source /pytorch/xla/docker/common.sh && run_deployment_tests']
 - name: 'gcr.io/cloud-builders/docker'
-  args: ['push', '--all-tags', 'gcr.io/tpu-pytorch/xla']
+  args: ['push', '--all-tags', 'gcr.io/tpu-pytorch-releases/xla']
   timeout: 2700s
-- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
+- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
   entrypoint: 'bash'
   args: ['-c', 'source /pytorch/xla/docker/common.sh && collect_wheels ${_RELEASE_VERSION}']
 
@@ -48,12 +48,12 @@ substitutions:
     _GITHUB_EXAMPLE_BRANCH: 'master'
 options:
     pool:
-      name: 'projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build'
+      name: 'projects/tpu-pytorch-releases/locations/us-central1/workerPools/wheel_build'
     dynamic_substitutions: true
     substitution_option: 'ALLOW_LOOSE'
 timeout: 32000s
 artifacts:
   objects:
     # CUDA wheels exported under `wheels/cuda/<cuda_version>`
-    location: 'gs://tpu-pytorch/wheels/$_UPLOAD_SUBDIR'
+    location: 'gs://tpu-pytorch-releases/wheels/$_UPLOAD_SUBDIR'
     paths: ['/**/*.whl']
diff --git a/docker/debug_cloudbuild.yaml b/docker/debug_cloudbuild.yaml
@@ -1,5 +1,5 @@
 # Cloud Build Configuration which:
-# Builds and pushes gcr.io/tpu-pytorch/xla_debug image.
+# Builds and pushes gcr.io/tpu-pytorch-releases/xla_debug image.
 steps:
 - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk:slim'
   args: ['bash', 'docker/debug_image_cleanup.sh']
@@ -11,16 +11,16 @@ steps:
           '--build-arg', 'python_version=${_PYTHON_VERSION}',
           '--build-arg', 'cloud_build=true',
           '--build-arg', 'release_version=${_RELEASE_VERSION}',
-          '-t', 'gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}',
-          '--cache-from', 'gcr.io/tpu-pytorch/xla_debug:nightly_3.6',
+          '-t', 'gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}',
+          '--cache-from', 'gcr.io/tpu-pytorch-releases/xla_debug:nightly_3.6',
           '-f', 'docker/Dockerfile', '.'
         ]
   timeout: 14400s
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: bash
-  args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
+  args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
 - name: 'gcr.io/cloud-builders/docker'
-  args: ['push', 'gcr.io/tpu-pytorch/xla_debug']
+  args: ['push', 'gcr.io/tpu-pytorch-releases/xla_debug']
   timeout: 1800s
 
 options:

diff --git a/docker/debug_image_cleanup.sh b/docker/debug_image_cleanup.sh
@@ -1,4 +1,4 @@
-IMAGE="gcr.io/tpu-pytorch/xla_debug"
+IMAGE="gcr.io/tpu-pytorch-releases/xla_debug"
 DATE=$(date --date='-90 days' +"%Y-%m-%dT%H:%M:%S")
 
 for digest in $(gcloud container images list-tags ${IMAGE} --limit=999999 --sort-by=TIMESTAMP --filter="timestamp.datetime < '${DATE}'" --format='get(digest)'); do

diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
@@ -2,7 +2,7 @@
 
 # Explicitly source bashrc even when running commands directly.
 # Since commands run as a separate subshell, we need to source manually.
-# ex. docker run -it gcr.io/tpu-pytorch/xla:nightly bash ...
+# ex. docker run -it gcr.io/tpu-pytorch-releases/xla:nightly bash ...
 # The above will not source bashrc without entrypoint.
 source ~/.bashrc
 

diff --git a/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml
@@ -0,0 +1,18 @@
+githubConfigUrl: ${github_repo_url}
+githubConfigSecret: github-pat
+minRunners: 1
+maxRunners: ${max_tpu_nodes}
+template:
+  spec:
+    containers:
+    - name: runner
+      image: ${runner_image}
+      command: ["/home/runner/run.sh"]
+      resources:
+        limits:
+          google.com/tpu: 4
+        requests:
+          google.com/tpu: 4
+    nodeSelector:
+      cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice
+      cloud.google.com/gke-tpu-topology: 2x2x1
diff --git a/infra/terraform_modules/arc_v5p_container_cluster/main.tf b/infra/terraform_modules/arc_v5p_container_cluster/main.tf
@@ -0,0 +1,99 @@
+provider "google" {
+  project = var.project_id
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = "https://${google_container_cluster.arc_v5p_cluster.endpoint}"
+    token                  = data.google_client_config.default.access_token
+    cluster_ca_certificate = base64decode(google_container_cluster.arc_v5p_cluster.master_auth.0.cluster_ca_certificate)
+  }
+}
+
+data "google_client_config" "default" {}
+
+resource "google_container_cluster" "arc_v5p_cluster" {
+  name     = var.cluster_name
+  location = "us-central2"
+
+  remove_default_node_pool = true
+  initial_node_count       = 1
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  min_master_version = 1.28
+}
+
+resource "google_container_node_pool" "arc_v5p_cpu_nodes" {
+  name       = var.cpu_nodepool_name
+  location   = "us-central2"
+  cluster    = google_container_cluster.arc_v5p_cluster.name
+  node_count = var.cpu_node_count
+
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+  }
+
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "google_container_node_pool" "arc_v5p_tpu_nodes" {
+  name               = var.tpu_nodepool_name
+  location           = "us-central2"
+  node_locations     = ["us-central2-b"]
+  cluster            = google_container_cluster.arc_v5p_cluster.name
+  initial_node_count = 1
+  autoscaling {
+    total_min_node_count = 1
+    total_max_node_count = var.max_tpu_nodes
+    location_policy      = "ANY"
+  }
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+    machine_type = "ct5p-hightpu-4t"
+  }
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "helm_release" "arc" {
+  name             = "actions-runner-controller"
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller"
+  version          = "0.9.3"
+  namespace        = var.arc_namespace
+  create_namespace = true
+}
+
+resource "helm_release" "arc_runner_set" {
+  name = "v5p-runner-set"
+  depends_on = [
+    helm_release.arc
+  ]
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
+  version          = "0.9.3"
+  namespace        = var.runner_namespace
+  create_namespace = true
+
+  values = [
+    templatefile("../terraform_modules/arc_v5p_container_cluster/arc-values.yaml", {
+      github_repo_url = var.github_repo_url
+      max_tpu_nodes   = var.max_tpu_nodes
+      runner_image    = var.runner_image
+    })
+  ]
+}
diff --git a/infra/terraform_modules/arc_v5p_container_cluster/variables.tf b/infra/terraform_modules/arc_v5p_container_cluster/variables.tf
@@ -0,0 +1,51 @@
+variable "cluster_name" {
+  description = "Name of the Container Cluster containing the v5p node pool"
+  type        = string
+}
+
+variable "cpu_nodepool_name" {
+  description = "Name of the CPU Nodepool"
+  type        = string
+}
+
+variable "cpu_node_count" {
+  description = "Number of CPU nodes"
+  type        = number
+}
+
+variable "tpu_nodepool_name" {
+  description = "Name of the TPU Nodepool"
+  type        = string
+}
+
+variable "max_tpu_nodes" {
+  description = "Maximum number of TPU nodes and runners"
+  type        = number
+}
+
+variable "arc_namespace" {
+  description = "The namespace where ARC will reside"
+  default     = "arc-systems"
+  type        = string
+}
+
+variable "runner_namespace" {
+  description = "The namespace where the ARC runners will reside"
+  default     = "arc-runners"
+  type        = string
+}
+
+variable "github_repo_url" {
+  description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC"
+  type        = string
+}
+
+variable "project_id" {
+  description = "The project ID"
+  type        = string
+}
+
+variable "runner_image" {
+  description = "The Docker image used in the self-hosted runner"
+  type        = string
+}
diff --git a/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml
@@ -0,0 +1,18 @@
+githubConfigUrl: ${github_repo_url}
+githubConfigSecret: github-pat
+minRunners: 1
+maxRunners: ${max_tpu_nodes}
+template:
+  spec:
+    containers:
+    - name: runner
+      image: ${runner_image}
+      command: ["/home/runner/run.sh"]
+      resources:
+        limits:
+          google.com/tpu: 4
+        requests:
+          google.com/tpu: 4
+    nodeSelector:
+      cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
+      cloud.google.com/gke-tpu-topology: 2x2