From 8940757e954037ba62596f04c54a7bf08c4c7381 Mon Sep 17 00:00:00 2001
From: zpcore <zpcore@gmail.com>
Date: Wed, 15 Jan 2025 00:10:11 +0000
Subject: [PATCH] Port infra from tpu-pytorch to tpu-pytorch-releases and
 support v4, v5p, v6e CI run

---
 .github/ci.md                                 |  4 +-
 .github/workflows/_tpu_ci.yml                 | 18 +++-
 .github/workflows/build_and_test.yml          |  7 ++
 benchmarks/run_benchmark.sh                   |  4 +-
 docker/cloudbuild.yaml                        | 16 +--
 docker/debug_cloudbuild.yaml                  | 10 +-
 docker/debug_image_cleanup.sh                 |  2 +-
 docker/docker-entrypoint.sh                   |  2 +-
 .../arc_v5p_container_cluster/arc-values.yaml | 18 ++++
 .../arc_v5p_container_cluster/main.tf         | 99 +++++++++++++++++++
 .../arc_v5p_container_cluster/variables.tf    | 51 ++++++++++
 .../arc_v6e_container_cluster/arc-values.yaml | 18 ++++
 .../arc_v6e_container_cluster/main.tf         | 99 +++++++++++++++++++
 .../arc_v6e_container_cluster/variables.tf    | 51 ++++++++++
 .../tpu-pytorch-releases/artifacts_builds.tf  | 71 -------------
 infra/tpu-pytorch-releases/iam.auto.tfvars    |  7 ++
 infra/tpu-pytorch-releases/iam.tf             | 32 ++++++
 infra/tpu-pytorch-releases/infra_triggers.tf  |  1 +
 infra/tpu-pytorch-releases/test_triggers.tf   | 50 ++++++++++
 infra/tpu-pytorch-releases/tpu_ci.tf          | 48 +++++++++
 infra/tpu-pytorch/iam.auto.tfvars             | 35 ++++++-
 infra/tpu-pytorch/infra_triggers.tf           |  2 +-
 infra/tpu-pytorch/tpu_ci.tf                   |  4 +-
 scripts/update_torch_wheels.sh                |  2 +-
 24 files changed, 552 insertions(+), 99 deletions(-)
 create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml
 create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/main.tf
 create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/variables.tf
 create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml
 create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/main.tf
 create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/variables.tf
 create mode 100644 infra/tpu-pytorch-releases/test_triggers.tf
 create mode 100644 infra/tpu-pytorch-releases/tpu_ci.tf

diff --git a/.github/ci.md b/.github/ci.md
index 69eb8ba969f6..4a262cec1c0f 100644
--- a/.github/ci.md
+++ b/.github/ci.md
@@ -57,7 +57,7 @@ For the C++ test groups in either case, the test binaries are pre-built during t
 
 The TPU CI runs only a subset of our tests due to capacity constraints, defined in `_tpu_ci.yml` `test/tpu/run_tests.sh`. The runners themselves are containers in GKE managed by [ARC](https://github.com/actions/actions-runner-controller). The container image is also based on our dev images, with some changes for ARC compatibility. The Dockerfile for this image lives in `test/tpu/Dockerfile`.
 
-The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch/tpu_ci.yml`.
+The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch-releases/tpu_ci.tf`.
 
 ### Reproducing test failures
 
@@ -95,7 +95,7 @@ If the TPU CI won't run, try to debug using the following steps:
 On your cloudtop:
 
 ```
-gcloud config set project tpu-pytorch
+gcloud config set project tpu-pytorch-releases
 gcloud container clusters get-credentials tpu-ci --location=us-central2
 ```
 
diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
index f04c2c3b0999..97397807cebe 100644
--- a/.github/workflows/_tpu_ci.yml
+++ b/.github/workflows/_tpu_ci.yml
@@ -1,9 +1,18 @@
 name: TPU Integration Test
 on:
   workflow_call:
+    inputs:
+      tpu-version:
+        required: true
+        type: string
+      runner-label:
+        required: true
+        type: string
+
 jobs:
   tpu-test:
-    runs-on: v4-runner-set
+    # Use dynamic runner based on TPU version
+    runs-on: ${{ inputs.runner-label }}-runner-set
     steps:
       - name: Checkout actions
         uses: actions/checkout@v4
@@ -11,26 +20,29 @@ jobs:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
+
       - name: Setup
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
           wheels-artifact: torch-xla-wheels
+
       - name: Install test dependencies
         shell: bash
         run: |
-          # TODO: Add these in setup.py
           pip install --upgrade pip
           pip install fsspec
           pip install rich
-          # Jax nightly is needed for pallas tests.
+          # Jax nightly is needed for pallas tests
           pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
           pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html
           pip install --upgrade protobuf
+
       - name: Run Tests
         env:
           PJRT_DEVICE: TPU
           TPU_LOG_DIR: tpu_logs
+          TPU_VERSION: ${{ inputs.tpu-version }}
         run: |
           cd pytorch/xla
           test/tpu/run_tests.sh
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ce5e86db0623..bd9f41ad7185 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -104,6 +104,13 @@ jobs:
     uses: ./.github/workflows/_tpu_ci.yml
     needs: build-torch-xla
     if: github.event_name == 'push' || github.event_name == 'pull_request'
+    strategy:
+      matrix:
+        tpu-version: ['v4', 'v5p', 'v6e']
+      fail-fast: false  # Continue running other TPU versions if one fails
+    with:
+      tpu-version: ${{ matrix.tpu-version }}
+      runner-label: ${{ matrix.tpu-version }}  # _tpu_ci.yml appends "-runner-set"; result must equal the ARC release name (e.g. v5p-runner-set)
 
   push-docs:
     name: "Build docs"
diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
index e4e483947d9e..20f86d6654b7 100644
--- a/benchmarks/run_benchmark.sh
+++ b/benchmarks/run_benchmark.sh
@@ -35,7 +35,7 @@ shift $(($OPTIND - 1))
 
 # func for test after ssh to VM, create container and execute in container
 function benchmarking_in_container {
-  sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8
+  sudo docker pull gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8
   sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent    software-properties-common
   nvidia-smi
   distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
@@ -43,7 +43,7 @@ function benchmarking_in_container {
   curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
   sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
   sudo systemctl restart docker
-  sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash
+  sudo docker run --gpus all -it -d gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8 bin/bash
   sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
   # install torchbench
   cd ~
diff --git a/docker/cloudbuild.yaml b/docker/cloudbuild.yaml
index 7f53eed206b7..99663f837cb2 100644
--- a/docker/cloudbuild.yaml
+++ b/docker/cloudbuild.yaml
@@ -1,5 +1,5 @@
 # Cloud Build Configuration which:
-# (1) Builds, tests, and pushes gcr.io/tpu-pytorch/xla image
+# (1) Builds, tests, and pushes gcr.io/tpu-pytorch-releases/xla image
 # (2) Collects and stores torch and torch_xla wheels
 steps:
 - name: 'gcr.io/cloud-builders/docker'
@@ -16,20 +16,20 @@ steps:
           '--build-arg', 'cuda_compute=${_CUDA_COMPUTE}',
           '--build-arg', 'xla_branch=${_GITHUB_XLA_BRANCH}',
           '--build-arg', 'examle_branch=${_GITHUB_EXAMPLE_BRANCH}',
-          '-t', 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}',
+          '-t', 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}',
           '-f', 'docker/Dockerfile', '.'
         ]
   timeout: 14400s
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: bash
-  args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
-- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
+  args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
+- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
   entrypoint: bash
   args: ['-c', 'source /pytorch/xla/docker/common.sh && run_deployment_tests']
 - name: 'gcr.io/cloud-builders/docker'
-  args: ['push', '--all-tags', 'gcr.io/tpu-pytorch/xla']
+  args: ['push', '--all-tags', 'gcr.io/tpu-pytorch-releases/xla']
   timeout: 2700s
-- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
+- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
   entrypoint: 'bash'
   args: ['-c', 'source /pytorch/xla/docker/common.sh && collect_wheels ${_RELEASE_VERSION}']
 
@@ -48,12 +48,12 @@ substitutions:
     _GITHUB_EXAMPLE_BRANCH: 'master'
 options:
     pool:
-      name: 'projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build'
+      name: 'projects/tpu-pytorch-releases/locations/us-central1/workerPools/wheel_build'
     dynamic_substitutions: true
     substitution_option: 'ALLOW_LOOSE'
 timeout: 32000s
 artifacts:
   objects:
     # CUDA wheels exported under `wheels/cuda/<cuda_version>`
-    location: 'gs://tpu-pytorch/wheels/$_UPLOAD_SUBDIR'
+    location: 'gs://tpu-pytorch-releases/wheels/$_UPLOAD_SUBDIR'
     paths: ['/**/*.whl']
diff --git a/docker/debug_cloudbuild.yaml b/docker/debug_cloudbuild.yaml
index f619821ac092..a90bc3e774f2 100644
--- a/docker/debug_cloudbuild.yaml
+++ b/docker/debug_cloudbuild.yaml
@@ -1,5 +1,5 @@
 # Cloud Build Configuration which:
-# Builds and pushes gcr.io/tpu-pytorch/xla_debug image.
+# Builds and pushes gcr.io/tpu-pytorch-releases/xla_debug image.
 steps:
 - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk:slim'
   args: ['bash', 'docker/debug_image_cleanup.sh']
@@ -11,16 +11,16 @@ steps:
           '--build-arg', 'python_version=${_PYTHON_VERSION}',
           '--build-arg', 'cloud_build=true',
           '--build-arg', 'release_version=${_RELEASE_VERSION}',
-          '-t', 'gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}',
-          '--cache-from', 'gcr.io/tpu-pytorch/xla_debug:nightly_3.6',
+          '-t', 'gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}',
+          '--cache-from', 'gcr.io/tpu-pytorch-releases/xla_debug:nightly_3.6',
           '-f', 'docker/Dockerfile', '.'
         ]
   timeout: 14400s
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: bash
-  args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
+  args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
 - name: 'gcr.io/cloud-builders/docker'
-  args: ['push', 'gcr.io/tpu-pytorch/xla_debug']
+  args: ['push', 'gcr.io/tpu-pytorch-releases/xla_debug']
   timeout: 1800s
 
 options:
diff --git a/docker/debug_image_cleanup.sh b/docker/debug_image_cleanup.sh
index 595d013b96d7..6296ea666aea 100644
--- a/docker/debug_image_cleanup.sh
+++ b/docker/debug_image_cleanup.sh
@@ -1,4 +1,4 @@
-IMAGE="gcr.io/tpu-pytorch/xla_debug"
+IMAGE="gcr.io/tpu-pytorch-releases/xla_debug"
 DATE=$(date --date='-90 days' +"%Y-%m-%dT%H:%M:%S")
 
 for digest in $(gcloud container images list-tags ${IMAGE} --limit=999999 --sort-by=TIMESTAMP --filter="timestamp.datetime < '${DATE}'" --format='get(digest)'); do
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
index 16184b416e35..1c5e8a6f3150 100755
--- a/docker/docker-entrypoint.sh
+++ b/docker/docker-entrypoint.sh
@@ -2,7 +2,7 @@
 
 # Explicitly source bashrc even when running commands directly.
 # Since commands run as a separate subshell, we need to source manually.
-# ex. docker run -it gcr.io/tpu-pytorch/xla:nightly bash ...
+# ex. docker run -it gcr.io/tpu-pytorch-releases/xla:nightly bash ...
 # The above will not source bashrc without entrypoint.
 source ~/.bashrc
 
diff --git a/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml
new file mode 100644
index 000000000000..38e6cc8f7a9f
--- /dev/null
+++ b/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml
@@ -0,0 +1,18 @@
+githubConfigUrl: ${github_repo_url}
+githubConfigSecret: github-pat
+minRunners: 1
+maxRunners: ${max_tpu_nodes}
+template:
+  spec:
+    containers:
+    - name: runner
+      image: ${runner_image}
+      command: ["/home/runner/run.sh"]
+      resources:
+        limits:
+          google.com/tpu: 4
+        requests:
+          google.com/tpu: 4
+    nodeSelector:
+      cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice
+      cloud.google.com/gke-tpu-topology: 2x2x1
diff --git a/infra/terraform_modules/arc_v5p_container_cluster/main.tf b/infra/terraform_modules/arc_v5p_container_cluster/main.tf
new file mode 100644
index 000000000000..58f730c645df
--- /dev/null
+++ b/infra/terraform_modules/arc_v5p_container_cluster/main.tf
@@ -0,0 +1,99 @@
+provider "google" {
+  project = var.project_id
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = "https://${google_container_cluster.arc_v5p_cluster.endpoint}"
+    token                  = data.google_client_config.default.access_token
+    cluster_ca_certificate = base64decode(google_container_cluster.arc_v5p_cluster.master_auth.0.cluster_ca_certificate)
+  }
+}
+
+data "google_client_config" "default" {}
+
+resource "google_container_cluster" "arc_v5p_cluster" {
+  name     = var.cluster_name
+  location = "us-central2"
+
+  remove_default_node_pool = true # Node pools are managed as separate resources below
+  initial_node_count       = 1
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  min_master_version = "1.28" # Keep as a string: a bare number like 1.30 would be rendered as "1.3"
+}
+
+resource "google_container_node_pool" "arc_v5p_cpu_nodes" {
+  name       = var.cpu_nodepool_name
+  location   = "us-central2"
+  cluster    = google_container_cluster.arc_v5p_cluster.name
+  node_count = var.cpu_node_count
+
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+  }
+
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "google_container_node_pool" "arc_v5p_tpu_nodes" {
+  name               = var.tpu_nodepool_name
+  location           = "us-central2"
+  node_locations     = ["us-central2-b"]
+  cluster            = google_container_cluster.arc_v5p_cluster.name
+  initial_node_count = 1
+  autoscaling {
+    total_min_node_count = 1
+    total_max_node_count = var.max_tpu_nodes
+    location_policy      = "ANY"
+  }
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+    machine_type = "ct5p-hightpu-4t"
+  }
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "helm_release" "arc" {
+  name             = "actions-runner-controller"
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller"
+  version          = "0.9.3"
+  namespace        = var.arc_namespace
+  create_namespace = true
+}
+
+resource "helm_release" "arc_runner_set" {
+  name = "v5p-runner-set" # Release name; the CI workflow's runs-on label is expected to match it
+  depends_on = [
+    helm_release.arc
+  ]
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
+  version          = "0.9.3"
+  namespace        = var.runner_namespace
+  create_namespace = true
+
+  values = [
+    templatefile("${path.module}/arc-values.yaml", { # path.module: resolve relative to this module, not the caller's cwd
+      github_repo_url = var.github_repo_url
+      max_tpu_nodes   = var.max_tpu_nodes
+      runner_image    = var.runner_image
+    })
+  ]
+}
diff --git a/infra/terraform_modules/arc_v5p_container_cluster/variables.tf b/infra/terraform_modules/arc_v5p_container_cluster/variables.tf
new file mode 100644
index 000000000000..648f461d4cde
--- /dev/null
+++ b/infra/terraform_modules/arc_v5p_container_cluster/variables.tf
@@ -0,0 +1,51 @@
+variable "cluster_name" {
+  description = "Name of the Container Cluster containing the v5p node pool"
+  type        = string
+}
+
+variable "cpu_nodepool_name" {
+  description = "Name of the CPU Nodepool"
+  type        = string
+}
+
+variable "cpu_node_count" {
+  description = "Number of CPU nodes"
+  type        = number
+}
+
+variable "tpu_nodepool_name" {
+  description = "Name of the TPU Nodepool"
+  type        = string
+}
+
+variable "max_tpu_nodes" {
+  description = "Maximum number of TPU nodes and runners"
+  type        = number
+}
+
+variable "arc_namespace" {
+  description = "The namespace where ARC will reside"
+  default     = "arc-systems"
+  type        = string
+}
+
+variable "runner_namespace" {
+  description = "The namespace where the ARC runners will reside"
+  default     = "arc-runners"
+  type        = string
+}
+
+variable "github_repo_url" {
+  description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC"
+  type        = string
+}
+
+variable "project_id" {
+  description = "The project ID"
+  type        = string
+}
+
+variable "runner_image" {
+  description = "The Docker image used in the self-hosted runner"
+  type        = string
+}
diff --git a/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml
new file mode 100644
index 000000000000..364ed6f2da05
--- /dev/null
+++ b/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml
@@ -0,0 +1,18 @@
+githubConfigUrl: ${github_repo_url}
+githubConfigSecret: github-pat
+minRunners: 1
+maxRunners: ${max_tpu_nodes}
+template:
+  spec:
+    containers:
+    - name: runner
+      image: ${runner_image}
+      command: ["/home/runner/run.sh"]
+      resources:
+        limits:
+          google.com/tpu: 4
+        requests:
+          google.com/tpu: 4
+    nodeSelector:
+      cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
+      cloud.google.com/gke-tpu-topology: 2x2
diff --git a/infra/terraform_modules/arc_v6e_container_cluster/main.tf b/infra/terraform_modules/arc_v6e_container_cluster/main.tf
new file mode 100644
index 000000000000..98e2d943da92
--- /dev/null
+++ b/infra/terraform_modules/arc_v6e_container_cluster/main.tf
@@ -0,0 +1,99 @@
+provider "google" {
+  project = var.project_id
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = "https://${google_container_cluster.arc_v6e_cluster.endpoint}"
+    token                  = data.google_client_config.default.access_token
+    cluster_ca_certificate = base64decode(google_container_cluster.arc_v6e_cluster.master_auth.0.cluster_ca_certificate)
+  }
+}
+
+data "google_client_config" "default" {}
+
+resource "google_container_cluster" "arc_v6e_cluster" {
+  name     = var.cluster_name
+  location = "us-central2"
+
+  remove_default_node_pool = true # Node pools are managed as separate resources below
+  initial_node_count       = 1
+
+  release_channel {
+    channel = "RAPID"
+  }
+
+  min_master_version = "1.28" # Keep as a string: a bare number like 1.30 would be rendered as "1.3"
+}
+
+resource "google_container_node_pool" "arc_v6e_cpu_nodes" {
+  name       = var.cpu_nodepool_name
+  location   = "us-central2"
+  cluster    = google_container_cluster.arc_v6e_cluster.name
+  node_count = var.cpu_node_count
+
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+  }
+
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "google_container_node_pool" "arc_v6e_tpu_nodes" {
+  name               = var.tpu_nodepool_name
+  location           = "us-central2"
+  node_locations     = ["us-central2-b"]
+  cluster            = google_container_cluster.arc_v6e_cluster.name
+  initial_node_count = 1
+  autoscaling {
+    total_min_node_count = 1
+    total_max_node_count = var.max_tpu_nodes
+    location_policy      = "ANY"
+  }
+  node_config {
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/logging.write",
+      "https://www.googleapis.com/auth/monitoring",
+      "https://www.googleapis.com/auth/devstorage.read_only",
+    ]
+    machine_type = "ct6e-standard-4t"
+  }
+  management {
+    auto_upgrade = true
+    auto_repair  = true
+  }
+}
+
+resource "helm_release" "arc" {
+  name             = "actions-runner-controller"
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller"
+  version          = "0.9.3"
+  namespace        = var.arc_namespace
+  create_namespace = true
+}
+
+resource "helm_release" "arc_runner_set" {
+  name = "v6e-runner-set" # Release name; the CI workflow's runs-on label is expected to match it
+  depends_on = [
+    helm_release.arc
+  ]
+  chart            = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
+  version          = "0.9.3"
+  namespace        = var.runner_namespace
+  create_namespace = true
+
+  values = [
+    templatefile("${path.module}/arc-values.yaml", { # path.module: resolve relative to this module, not the caller's cwd
+      github_repo_url = var.github_repo_url
+      max_tpu_nodes   = var.max_tpu_nodes
+      runner_image    = var.runner_image
+    })
+  ]
+}
diff --git a/infra/terraform_modules/arc_v6e_container_cluster/variables.tf b/infra/terraform_modules/arc_v6e_container_cluster/variables.tf
new file mode 100644
index 000000000000..e7d402314055
--- /dev/null
+++ b/infra/terraform_modules/arc_v6e_container_cluster/variables.tf
@@ -0,0 +1,51 @@
+variable "cluster_name" {
+  description = "Name of the Container Cluster containing the v6e node pool"
+  type        = string
+}
+
+variable "cpu_nodepool_name" {
+  description = "Name of the CPU Nodepool"
+  type        = string
+}
+
+variable "cpu_node_count" {
+  description = "Number of CPU nodes"
+  type        = number
+}
+
+variable "tpu_nodepool_name" {
+  description = "Name of the TPU Nodepool"
+  type        = string
+}
+
+variable "max_tpu_nodes" {
+  description = "Maximum number of TPU nodes and runners"
+  type        = number
+}
+
+variable "arc_namespace" {
+  description = "The namespace where ARC will reside"
+  default     = "arc-systems"
+  type        = string
+}
+
+variable "runner_namespace" {
+  description = "The namespace where the ARC runners will reside"
+  default     = "arc-runners"
+  type        = string
+}
+
+variable "github_repo_url" {
+  description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC"
+  type        = string
+}
+
+variable "project_id" {
+  description = "The project ID"
+  type        = string
+}
+
+variable "runner_image" {
+  description = "The Docker image used in the self-hosted runner"
+  type        = string
+}
diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf
index 3fc50a1ae662..0ad551452c9d 100644
--- a/infra/tpu-pytorch-releases/artifacts_builds.tf
+++ b/infra/tpu-pytorch-releases/artifacts_builds.tf
@@ -17,22 +17,6 @@ variable "nightly_builds" {
   default = []
 }
 
-// TODO: Remove this after the 2.1 release
-variable "xrt_versioned_builds" {
-  type = list(
-    object({
-      package_version = string
-      accelerator    = string
-      pytorch_git_rev = optional(string, "")
-      cuda_version   = optional(string, "11.8")
-      python_version = optional(string, "3.8")
-      arch           = optional(string, "amd64")
-    })
-  )
-
-  default = []
-}
-
 variable "versioned_builds" {
   type = list(
     object({
@@ -62,16 +46,6 @@ locals {
     ) => b
   }
 
-  // TODO: Remove this after the 2.1 release
-  xrt_versioned_builds_dict = {
-    for b in var.xrt_versioned_builds :
-    format("r%s_%s_%s",
-      replace(b.package_version, "+", "_"),
-      b.python_version,
-      b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version)
-    ) => b
-  }
-
   versioned_builds_dict = {
     for b in var.versioned_builds :
     format("r%s_%s_%s%s",
@@ -129,51 +103,6 @@ module "nightly_builds" {
   docker_repo_url         = module.docker_registry.url
 }
 
-// TODO: Remove this after the 2.1 release
-module "xrt_versioned_builds" {
-  source   = "../terraform_modules/xla_docker_build"
-  for_each = local.xrt_versioned_builds_dict
-
-  ansible_vars = merge(each.value, {
-    xla_git_rev     = "$COMMIT_SHA",
-    cxx11_abi       = each.value.cxx11_abi
-  })
-
-  trigger_on_schedule = { schedule = "0 0 * * *", branch = "xrt" }
-
-  trigger_name = replace(each.key, "/[_.]/", "-")
-  image_name   = "xla"
-  image_tags = [
-    each.key,
-    # Append _YYYYMMDD suffix to nightly image name.
-    "${each.key}_$(date +%Y%m%d)",
-  ]
-
-  description = join(" ", [
-    "Builds nightly xla:${each.key}' ${
-      each.value.accelerator == "tpu"
-      ? "TPU"
-      : format("CUDA %s", each.value.cuda_version)
-    } docker image and corresponding wheels for PyTorch/XLA.",
-    "Trigger managed by Terraform setup in",
-    "infra/tpu-pytorch-releases/artifacts_builds.tf."
-  ])
-
-  wheels_dest = "${module.releases_storage_bucket.url}/wheels/xrt/${
-    each.value.accelerator == "tpu"
-    ? "tpuvm"
-    : "cuda/${each.value.cuda_version}"
-  }"
-  wheels_srcs = ["/dist/*.whl"]
-  build_args = {
-    python_version = each.value.python_version
-  }
-
-  scheduler_account_email = module.scheduler_account.email
-  worker_pool_id          = module.worker_pool.id
-  docker_repo_url         = module.docker_registry.url
-}
-
 module "versioned_builds" {
   source   = "../terraform_modules/xla_docker_build"
   for_each = local.versioned_builds_dict
diff --git a/infra/tpu-pytorch-releases/iam.auto.tfvars b/infra/tpu-pytorch-releases/iam.auto.tfvars
index 10584c44df5a..a50da998a9b7 100644
--- a/infra/tpu-pytorch-releases/iam.auto.tfvars
+++ b/infra/tpu-pytorch-releases/iam.auto.tfvars
@@ -5,3 +5,10 @@ project_admins = [
 
 cloudbuild_editors = [
 ]
+
+project_remote_build_writers = [
+  "group:cloud-tpus-dev-team@twosync.google.com",
+  "user:pytorchxla-general@google.com",
+  # tpu-pytorch-releases project: default Service Account for running Cloud Build jobs.
+  "serviceAccount:1001674285173@cloudbuild.gserviceaccount.com"
+]
diff --git a/infra/tpu-pytorch-releases/iam.tf b/infra/tpu-pytorch-releases/iam.tf
index d300bfbec566..a6c78df9c2b0 100644
--- a/infra/tpu-pytorch-releases/iam.tf
+++ b/infra/tpu-pytorch-releases/iam.tf
@@ -33,3 +33,35 @@ resource "google_project_iam_member" "cloudbuild_editor" {
   role    = "roles/cloudbuild.builds.editor"
   member  = each.value
 }
+
+resource "google_project_iam_custom_role" "remote_bazel_role" {
+  role_id     = "remoteBuildWriterRole"
+  title       = "Remote Build Writer"
+  description = "For running remote bazel builds and read/write from remote cache on GCP."
+  stage       = "ALPHA"
+  permissions = [
+    "remotebuildexecution.actions.create",
+    "remotebuildexecution.actions.get",
+    "remotebuildexecution.actions.set",
+    "remotebuildexecution.blobs.create",
+    "remotebuildexecution.blobs.get",
+    "remotebuildexecution.logstreams.create",
+    "remotebuildexecution.logstreams.get",
+    "remotebuildexecution.logstreams.update",
+  ]
+}
+
+data "google_project" "project" {}
+
+variable "project_remote_build_writers" {
+  type    = list(string)
+  default = []
+}
+
+resource "google_project_iam_member" "project_remote_build_writers" {
+  for_each = toset(var.project_remote_build_writers)
+
+  project = data.google_project.project.project_id
+  role    = google_project_iam_custom_role.remote_bazel_role.id
+  member  = each.value
+}
diff --git a/infra/tpu-pytorch-releases/infra_triggers.tf b/infra/tpu-pytorch-releases/infra_triggers.tf
index 07e6b967ac89..b342b6d90cf7 100644
--- a/infra/tpu-pytorch-releases/infra_triggers.tf
+++ b/infra/tpu-pytorch-releases/infra_triggers.tf
@@ -6,4 +6,5 @@ module "terraform_apply" {
   config_directory = "infra/tpu-pytorch-releases"
 
   worker_pool_id = module.worker_pool.id
+  location       = "global"
 }
diff --git a/infra/tpu-pytorch-releases/test_triggers.tf b/infra/tpu-pytorch-releases/test_triggers.tf
new file mode 100644
index 000000000000..111e0a5cb290
--- /dev/null
+++ b/infra/tpu-pytorch-releases/test_triggers.tf
@@ -0,0 +1,50 @@
+module "tpu_e2e_tests" {
+  source = "../terraform_modules/xla_docker_build"
+
+  trigger_name = "ci-tpu-test-trigger"
+
+  trigger_on_push = {
+    branch = "master"
+    ignored_files = ["experimental/torch_xla2/**"]
+  }
+  run_e2e_tests   = true
+
+  image_name = "pytorch-xla-test"
+  image_tags = [
+    # $BUILD_ID is a GCB variable, not a bash variable.
+    # See https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_default_substitutions.
+    "$BUILD_ID",
+  ]
+  dockerfile = "e2e_tests.Dockerfile"
+  description = join(" ", [
+    "Run e2e TPU tests on an image built from master branch.",
+    "Trigger managed by Terraform setup in",
+    "infra/tpu-pytorch-releases/test_triggers.tf.",
+  ])
+
+  build_args = {
+    python_version = "3.10"
+  }
+
+  ansible_vars = {
+    arch            = "amd64"
+    accelerator     = "tpu"
+    pytorch_git_rev = "main"
+    # The commit ID associated with the triggered build. Substituted when
+    # Cloud Build is triggered.
+    xla_git_rev = "$COMMIT_SHA"
+    bundle_libtpu = "0"
+  }
+
+  # Substitutions used in the "run_e2e_tests" step, see
+  # infra/terraform_modules/xla_docker_build/xla_docker_build.tf.
+  substitutions = {
+    _CLUSTER_NAME = "tpu-cluster"
+    _CLUSTER_ZONE = "europe-west4-a"
+  }
+
+  docker_repo_url = module.docker_registry.url
+  worker_pool_id  = module.worker_pool.id
+  timeout_minutes = 4 * 60
+  location        = "global"
+}
diff --git a/infra/tpu-pytorch-releases/tpu_ci.tf b/infra/tpu-pytorch-releases/tpu_ci.tf
new file mode 100644
index 000000000000..e90141891b7d
--- /dev/null
+++ b/infra/tpu-pytorch-releases/tpu_ci.tf
@@ -0,0 +1,48 @@
+# This Terraform configuration manages CI/CD infrastructure for PyTorch/XLA testing
+# across multiple TPU hardware generations (v4, v5p, v6e). It creates:
+# - Separate GKE clusters for each TPU version
+# - Node pools with both CPU and TPU nodes
+# - GitHub Actions runner configuration for automated testing
+# - Custom CI runner container deployment
+# 
+# The infrastructure is used to run automated tests for the pytorch/xla repository
+# ensuring compatibility and performance across TPU generations.
+
+module "v4_arc_cluster" {
+  source            = "../terraform_modules/arc_v4_container_cluster"
+  project_id        = "tpu-pytorch-releases"
+  cluster_name      = "tpu-ci"
+  cpu_nodepool_name = "cpu-nodepool"
+  cpu_node_count    = 1
+  tpu_nodepool_name = "tpu-nodepool"
+  max_tpu_nodes     = 4
+  github_repo_url   = "https://github.com/pytorch/xla"
+  # Dockerfile for this image can be found at test/tpu/Dockerfile
+  runner_image      = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest"
+}
+
+module "v5p_arc_cluster" {
+  source            = "../terraform_modules/arc_v5p_container_cluster"
+  project_id        = "tpu-pytorch-releases"
+  cluster_name      = "tpu-ci-v5p" # Must not reuse "tpu-ci": the v4 cluster already has that name in this project
+  cpu_nodepool_name = "cpu-nodepool"
+  cpu_node_count    = 1
+  tpu_nodepool_name = "tpu-nodepool"
+  max_tpu_nodes     = 4
+  github_repo_url   = "https://github.com/pytorch/xla"
+  # Dockerfile for this image can be found at test/tpu/Dockerfile
+  runner_image      = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest"
+}
+
+module "v6e_arc_cluster" {
+  source            = "../terraform_modules/arc_v6e_container_cluster"
+  project_id        = "tpu-pytorch-releases"
+  cluster_name      = "tpu-ci-v6e" # Must not reuse "tpu-ci": the v4 cluster already has that name in this project
+  cpu_nodepool_name = "cpu-nodepool"
+  cpu_node_count    = 1
+  tpu_nodepool_name = "tpu-nodepool"
+  max_tpu_nodes     = 4
+  github_repo_url   = "https://github.com/pytorch/xla"
+  # Dockerfile for this image can be found at test/tpu/Dockerfile
+  runner_image      = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest"
+}
diff --git a/infra/tpu-pytorch/iam.auto.tfvars b/infra/tpu-pytorch/iam.auto.tfvars
index 958c108e2c74..169de15250df 100644
--- a/infra/tpu-pytorch/iam.auto.tfvars
+++ b/infra/tpu-pytorch/iam.auto.tfvars
@@ -1,7 +1,38 @@
 project_remote_build_writers = [
   "group:cloud-tpus-dev-team@twosync.google.com",
-  "user:mlewko@google.com",
-  "user:goranpetrovic@google.com",
+  "user:pytorchxla-general@google.com",
   # tpu-pytorch-releases project: default Service Account for running Cloud Build jobs.
   "serviceAccount:1001674285173@cloudbuild.gserviceaccount.com"
 ]
+
+cloudbuild_editors = [
+  "user:pytorchxla-general@google.com",
+]
+
+artifact_registry_administrators = [
+  "user:pytorchxla-general@google.com",
+]
+
+bigquery_admins = [
+  "user:pytorchxla-general@google.com",
+]
+
+compute_admins = [
+  "user:pytorchxla-general@google.com",
+]
+
+remote_bazel = [
+  "user:pytorchxla-general@google.com",
+]
+
+role_viewers = [
+  "user:pytorchxla-general@google.com",
+]
+
+storage_admins = [
+  "user:pytorchxla-general@google.com",
+]
+
+tpu_admins = [
+  "user:pytorchxla-general@google.com",
+]
diff --git a/infra/tpu-pytorch/infra_triggers.tf b/infra/tpu-pytorch/infra_triggers.tf
index 69770a5151aa..c772466ed30e 100644
--- a/infra/tpu-pytorch/infra_triggers.tf
+++ b/infra/tpu-pytorch/infra_triggers.tf
@@ -3,7 +3,7 @@ module "terraform_apply" {
 
   included_files    = ["infra/**"]
   branch           = "master"
-  config_directory = "infra/tpu-pytorch"
+  config_directory = "infra/tpu-pytorch-releases"
 
   worker_pool_id = module.worker_pool.id
   location       = "global"
diff --git a/infra/tpu-pytorch/tpu_ci.tf b/infra/tpu-pytorch/tpu_ci.tf
index cf2ae57a8139..64b350ef5f4c 100644
--- a/infra/tpu-pytorch/tpu_ci.tf
+++ b/infra/tpu-pytorch/tpu_ci.tf
@@ -1,6 +1,6 @@
 module "v4_arc_cluster" {
   source            = "../terraform_modules/arc_v4_container_cluster"
-  project_id        = "tpu-pytorch"
+  project_id        = "tpu-pytorch-releases"
   cluster_name      = "tpu-ci"
   cpu_nodepool_name = "cpu-nodepool"
   cpu_node_count    = 1
@@ -8,5 +8,5 @@ module "v4_arc_cluster" {
   max_tpu_nodes     = 2
   github_repo_url   = "https://github.com/pytorch/xla"
   # Dockerfile for this image can be found at test/tpu/Dockerfile
-  runner_image      = "gcr.io/tpu-pytorch/tpu-ci-runner:latest"
+  runner_image      = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest"
 }
diff --git a/scripts/update_torch_wheels.sh b/scripts/update_torch_wheels.sh
index 534f5d151170..b2622b2b4cf6 100755
--- a/scripts/update_torch_wheels.sh
+++ b/scripts/update_torch_wheels.sh
@@ -2,7 +2,7 @@
 set -e
 set -x
 
-DIST_BUCKET="gs://tpu-pytorch/wheels"
+DIST_BUCKET="gs://tpu-pytorch-releases/wheels"
 TORCH_WHEEL="torch-nightly-cp36-cp36m-linux_x86_64.whl"
 TORCH_XLA_WHEEL="torch_xla-nightly-cp36-cp36m-linux_x86_64.whl"
 TORCHVISION_WHEEL="torchvision-nightly-cp36-cp36m-linux_x86_64.whl"