From 8940757e954037ba62596f04c54a7bf08c4c7381 Mon Sep 17 00:00:00 2001 From: zpcore Date: Wed, 15 Jan 2025 00:10:11 +0000 Subject: [PATCH] Port infra from tpu-pytorch to tpu-pytorch-releases and support v4, v5p, v6e CI run --- .github/ci.md | 4 +- .github/workflows/_tpu_ci.yml | 18 +++- .github/workflows/build_and_test.yml | 7 ++ benchmarks/run_benchmark.sh | 4 +- docker/cloudbuild.yaml | 16 +-- docker/debug_cloudbuild.yaml | 10 +- docker/debug_image_cleanup.sh | 2 +- docker/docker-entrypoint.sh | 2 +- .../arc_v5p_container_cluster/arc-values.yaml | 18 ++++ .../arc_v5p_container_cluster/main.tf | 99 +++++++++++++++++++ .../arc_v5p_container_cluster/variables.tf | 51 ++++++++++ .../arc_v6e_container_cluster/arc-values.yaml | 18 ++++ .../arc_v6e_container_cluster/main.tf | 99 +++++++++++++++++++ .../arc_v6e_container_cluster/variables.tf | 51 ++++++++++ .../tpu-pytorch-releases/artifacts_builds.tf | 71 ------------- infra/tpu-pytorch-releases/iam.auto.tfvars | 7 ++ infra/tpu-pytorch-releases/iam.tf | 32 ++++++ infra/tpu-pytorch-releases/infra_triggers.tf | 1 + infra/tpu-pytorch-releases/test_triggers.tf | 50 ++++++++++ infra/tpu-pytorch-releases/tpu_ci.tf | 48 +++++++++ infra/tpu-pytorch/iam.auto.tfvars | 35 ++++++- infra/tpu-pytorch/infra_triggers.tf | 2 +- infra/tpu-pytorch/tpu_ci.tf | 4 +- scripts/update_torch_wheels.sh | 2 +- 24 files changed, 552 insertions(+), 99 deletions(-) create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/main.tf create mode 100644 infra/terraform_modules/arc_v5p_container_cluster/variables.tf create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/main.tf create mode 100644 infra/terraform_modules/arc_v6e_container_cluster/variables.tf create mode 100644 infra/tpu-pytorch-releases/test_triggers.tf create mode 100644 
infra/tpu-pytorch-releases/tpu_ci.tf diff --git a/.github/ci.md b/.github/ci.md index 69eb8ba969f6..4a262cec1c0f 100644 --- a/.github/ci.md +++ b/.github/ci.md @@ -57,7 +57,7 @@ For the C++ test groups in either case, the test binaries are pre-built during t The TPU CI runs only a subset of our tests due to capacity constraints, defined in `_tpu_ci.yml` `test/tpu/run_tests.sh`. The runners themselves are containers in GKE managed by [ARC](https://github.com/actions/actions-runner-controller). The container image is also based on our dev images, with some changes for ARC compatibility. The Dockerfile for this image lives in `test/tpu/Dockerfile`. -The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch/tpu_ci.yml`. +The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch-releases/tpu_ci.tf`. ### Reproducing test failures @@ -95,7 +95,7 @@ If the TPU CI won't run, try to debug using the following steps: On your cloudtop: ``` -gcloud config set project tpu-pytorch +gcloud config set project tpu-pytorch-releases gcloud container clusters get-credentials tpu-ci --location=us-central2 ``` diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml index f04c2c3b0999..97397807cebe 100644 --- a/.github/workflows/_tpu_ci.yml +++ b/.github/workflows/_tpu_ci.yml @@ -1,9 +1,18 @@ name: TPU Integration Test on: workflow_call: + inputs: + tpu-version: + required: true + type: string + runner-label: + required: true + type: string + jobs: tpu-test: - runs-on: v4-runner-set + # Use dynamic runner based on TPU version + runs-on: ${{ inputs.runner-label }}-runner-set steps: - name: Checkout actions uses: actions/checkout@v4 @@ -11,26 +20,29 @@ jobs: sparse-checkout: | .github/workflows/setup path: .actions + - name: Setup uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} wheels-artifact: torch-xla-wheels + - name: Install test dependencies shell: bash run: | - # TODO: Add these in setup.py pip install 
--upgrade pip pip install fsspec pip install rich - # Jax nightly is needed for pallas tests. + # Jax nightly is needed for pallas tests pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html pip install --upgrade protobuf + - name: Run Tests env: PJRT_DEVICE: TPU TPU_LOG_DIR: tpu_logs + TPU_VERSION: ${{ inputs.tpu-version }} run: | cd pytorch/xla test/tpu/run_tests.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ce5e86db0623..bd9f41ad7185 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -104,6 +104,13 @@ jobs: uses: ./.github/workflows/_tpu_ci.yml needs: build-torch-xla if: github.event_name == 'push' || github.event_name == 'pull_request' + strategy: + matrix: + tpu-version: ['v4', 'v5p', 'v6e'] + fail-fast: false # Continue running other TPU versions if one fails + with: + tpu-version: ${{ matrix.tpu-version }} + runner-label: ${{ matrix.tpu-version }} push-docs: name: "Build docs" diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh index e4e483947d9e..20f86d6654b7 100644 --- a/benchmarks/run_benchmark.sh +++ b/benchmarks/run_benchmark.sh @@ -35,7 +35,7 @@ shift $(($OPTIND - 1)) # func for test after ssh to VM, create container and execute in container function benchmarking_in_container { - sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 + sudo docker pull gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8 sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common nvidia-smi distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) @@ -43,7 +43,7 @@ function benchmarking_in_container { curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker - sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash + sudo docker run --gpus all -it -d gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8 bin/bash sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash # install torchbench cd ~ diff --git a/docker/cloudbuild.yaml b/docker/cloudbuild.yaml index 7f53eed206b7..99663f837cb2 100644 --- a/docker/cloudbuild.yaml +++ b/docker/cloudbuild.yaml @@ -1,5 +1,5 @@ # Cloud Build Configuration which: -# (1) Builds, tests, and pushes gcr.io/tpu-pytorch/xla image +# (1) Builds, tests, and pushes gcr.io/tpu-pytorch-releases/xla image # (2) Collects and stores torch and torch_xla wheels steps: - name: 'gcr.io/cloud-builders/docker' @@ -16,20 +16,20 @@ steps: '--build-arg', 'cuda_compute=${_CUDA_COMPUTE}', '--build-arg', 'xla_branch=${_GITHUB_XLA_BRANCH}', '--build-arg', 'examle_branch=${_GITHUB_EXAMPLE_BRANCH}', - '-t', 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}', + '-t', 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}', '-f', 'docker/Dockerfile', '.' 
] timeout: 14400s - name: 'gcr.io/cloud-builders/docker' entrypoint: bash - args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)'] -- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}' + args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)'] +- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}' entrypoint: bash args: ['-c', 'source /pytorch/xla/docker/common.sh && run_deployment_tests'] - name: 'gcr.io/cloud-builders/docker' - args: ['push', '--all-tags', 'gcr.io/tpu-pytorch/xla'] + args: ['push', '--all-tags', 'gcr.io/tpu-pytorch-releases/xla'] timeout: 2700s -- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}' +- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}' entrypoint: 'bash' args: ['-c', 'source /pytorch/xla/docker/common.sh && collect_wheels ${_RELEASE_VERSION}'] @@ -48,12 +48,12 @@ substitutions: _GITHUB_EXAMPLE_BRANCH: 'master' options: pool: - name: 'projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build' + name: 'projects/tpu-pytorch-releases/locations/us-central1/workerPools/wheel_build' dynamic_substitutions: true substitution_option: 'ALLOW_LOOSE' timeout: 32000s artifacts: objects: # CUDA wheels exported under `wheels/cuda/` - location: 'gs://tpu-pytorch/wheels/$_UPLOAD_SUBDIR' + location: 'gs://tpu-pytorch-releases/wheels/$_UPLOAD_SUBDIR' paths: ['/**/*.whl'] diff --git a/docker/debug_cloudbuild.yaml b/docker/debug_cloudbuild.yaml index f619821ac092..a90bc3e774f2 100644 --- a/docker/debug_cloudbuild.yaml +++ b/docker/debug_cloudbuild.yaml @@ -1,5 +1,5 @@ # Cloud Build Configuration which: -# Builds and pushes gcr.io/tpu-pytorch/xla_debug image. +# Builds and pushes gcr.io/tpu-pytorch-releases/xla_debug image. 
steps: - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk:slim' args: ['bash', 'docker/debug_image_cleanup.sh'] @@ -11,16 +11,16 @@ steps: '--build-arg', 'python_version=${_PYTHON_VERSION}', '--build-arg', 'cloud_build=true', '--build-arg', 'release_version=${_RELEASE_VERSION}', - '-t', 'gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}', - '--cache-from', 'gcr.io/tpu-pytorch/xla_debug:nightly_3.6', + '-t', 'gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}', + '--cache-from', 'gcr.io/tpu-pytorch-releases/xla_debug:nightly_3.6', '-f', 'docker/Dockerfile', '.' ] timeout: 14400s - name: 'gcr.io/cloud-builders/docker' entrypoint: bash - args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)'] + args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)'] - name: 'gcr.io/cloud-builders/docker' - args: ['push', 'gcr.io/tpu-pytorch/xla_debug'] + args: ['push', 'gcr.io/tpu-pytorch-releases/xla_debug'] timeout: 1800s options: diff --git a/docker/debug_image_cleanup.sh b/docker/debug_image_cleanup.sh index 595d013b96d7..6296ea666aea 100644 --- a/docker/debug_image_cleanup.sh +++ b/docker/debug_image_cleanup.sh @@ -1,4 +1,4 @@ -IMAGE="gcr.io/tpu-pytorch/xla_debug" +IMAGE="gcr.io/tpu-pytorch-releases/xla_debug" DATE=$(date --date='-90 days' +"%Y-%m-%dT%H:%M:%S") for digest in $(gcloud container images list-tags ${IMAGE} --limit=999999 --sort-by=TIMESTAMP --filter="timestamp.datetime < '${DATE}'" --format='get(digest)'); do diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index 16184b416e35..1c5e8a6f3150 100755 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -2,7 +2,7 @@ # Explicitly source bashrc even when running commands directly. # Since commands run as a separate subshell, we need to source manually. -# ex. docker run -it gcr.io/tpu-pytorch/xla:nightly bash ... 
+# ex. docker run -it gcr.io/tpu-pytorch-releases/xla:nightly bash ... # The above will not source bashrc without entrypoint. source ~/.bashrc diff --git a/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml new file mode 100644 index 000000000000..38e6cc8f7a9f --- /dev/null +++ b/infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml @@ -0,0 +1,18 @@ +githubConfigUrl: ${github_repo_url} +githubConfigSecret: github-pat +minRunners: 1 +maxRunners: ${max_tpu_nodes} +template: + spec: + containers: + - name: runner + image: ${runner_image} + command: ["/home/runner/run.sh"] + resources: + limits: + google.com/tpu: 4 + requests: + google.com/tpu: 4 + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice + cloud.google.com/gke-tpu-topology: 2x2x1 diff --git a/infra/terraform_modules/arc_v5p_container_cluster/main.tf b/infra/terraform_modules/arc_v5p_container_cluster/main.tf new file mode 100644 index 000000000000..58f730c645df --- /dev/null +++ b/infra/terraform_modules/arc_v5p_container_cluster/main.tf @@ -0,0 +1,99 @@ +provider "google" { + project = var.project_id +} + +provider "helm" { + kubernetes { + host = "https://${google_container_cluster.arc_v5p_cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(google_container_cluster.arc_v5p_cluster.master_auth.0.cluster_ca_certificate) + } +} + +data "google_client_config" "default" {} + +resource "google_container_cluster" "arc_v5p_cluster" { + name = var.cluster_name + location = "us-central2" + + remove_default_node_pool = true + initial_node_count = 1 + + release_channel { + channel = "RAPID" + } + + min_master_version = 1.28 +} + +resource "google_container_node_pool" "arc_v5p_cpu_nodes" { + name = var.cpu_nodepool_name + location = "us-central2" + cluster = google_container_cluster.arc_v5p_cluster.name + node_count = var.cpu_node_count + + 
node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + } + + management { + auto_upgrade = true + auto_repair = true + } +} + +resource "google_container_node_pool" "arc_v5p_tpu_nodes" { + name = var.tpu_nodepool_name + location = "us-central2" + node_locations = ["us-central2-b"] + cluster = google_container_cluster.arc_v5p_cluster.name + initial_node_count = 1 + autoscaling { + total_min_node_count = 1 + total_max_node_count = var.max_tpu_nodes + location_policy = "ANY" + } + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + machine_type = "ct5p-hightpu-4t" + } + management { + auto_upgrade = true + auto_repair = true + } +} + +resource "helm_release" "arc" { + name = "actions-runner-controller" + chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller" + version = "0.9.3" + namespace = var.arc_namespace + create_namespace = true +} + +resource "helm_release" "arc_runner_set" { + name = "v5p-runner-set" + depends_on = [ + helm_release.arc + ] + chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set" + version = "0.9.3" + namespace = var.runner_namespace + create_namespace = true + + values = [ + templatefile("../terraform_modules/arc_v5p_container_cluster/arc-values.yaml", { + github_repo_url = var.github_repo_url + max_tpu_nodes = var.max_tpu_nodes + runner_image = var.runner_image + }) + ] +} diff --git a/infra/terraform_modules/arc_v5p_container_cluster/variables.tf b/infra/terraform_modules/arc_v5p_container_cluster/variables.tf new file mode 100644 index 000000000000..648f461d4cde --- /dev/null +++ b/infra/terraform_modules/arc_v5p_container_cluster/variables.tf @@ -0,0 +1,51 @@ +variable "cluster_name" { + 
description = "Name of the Container Cluster containing the v5p node pool" + type = string +} + +variable "cpu_nodepool_name" { + description = "Name of the CPU Nodepool" + type = string +} + +variable "cpu_node_count" { + description = "Number of CPU nodes" + type = number +} + +variable "tpu_nodepool_name" { + description = "Name of the TPU Nodepool" + type = string +} + +variable "max_tpu_nodes" { + description = "Maximum number of TPU nodes and runners" + type = number +} + +variable "arc_namespace" { + description = "The namespace where ARC will reside" + default = "arc-systems" + type = string +} + +variable "runner_namespace" { + description = "The namespace where the ARC runners will reside" + default = "arc-runners" + type = string +} + +variable "github_repo_url" { + description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC" + type = string +} + +variable "project_id" { + description = "The project ID" + type = string +} + +variable "runner_image" { + description = "The Docker image used in the self-hosted runner" + type = string +} diff --git a/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml b/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml new file mode 100644 index 000000000000..364ed6f2da05 --- /dev/null +++ b/infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml @@ -0,0 +1,18 @@ +githubConfigUrl: ${github_repo_url} +githubConfigSecret: github-pat +minRunners: 1 +maxRunners: ${max_tpu_nodes} +template: + spec: + containers: + - name: runner + image: ${runner_image} + command: ["/home/runner/run.sh"] + resources: + limits: + google.com/tpu: 4 + requests: + google.com/tpu: 4 + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice + cloud.google.com/gke-tpu-topology: 2x2 diff --git a/infra/terraform_modules/arc_v6e_container_cluster/main.tf b/infra/terraform_modules/arc_v6e_container_cluster/main.tf new file mode 100644 index 
000000000000..98e2d943da92 --- /dev/null +++ b/infra/terraform_modules/arc_v6e_container_cluster/main.tf @@ -0,0 +1,99 @@ +provider "google" { + project = var.project_id +} + +provider "helm" { + kubernetes { + host = "https://${google_container_cluster.arc_v6e_cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(google_container_cluster.arc_v6e_cluster.master_auth.0.cluster_ca_certificate) + } +} + +data "google_client_config" "default" {} + +resource "google_container_cluster" "arc_v6e_cluster" { + name = var.cluster_name + location = "us-central2" + + remove_default_node_pool = true + initial_node_count = 1 + + release_channel { + channel = "RAPID" + } + + min_master_version = 1.28 +} + +resource "google_container_node_pool" "arc_v6e_cpu_nodes" { + name = var.cpu_nodepool_name + location = "us-central2" + cluster = google_container_cluster.arc_v6e_cluster.name + node_count = var.cpu_node_count + + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + } + + management { + auto_upgrade = true + auto_repair = true + } +} + +resource "google_container_node_pool" "arc_v6e_tpu_nodes" { + name = var.tpu_nodepool_name + location = "us-central2" + node_locations = ["us-central2-b"] + cluster = google_container_cluster.arc_v6e_cluster.name + initial_node_count = 1 + autoscaling { + total_min_node_count = 1 + total_max_node_count = var.max_tpu_nodes + location_policy = "ANY" + } + node_config { + oauth_scopes = [ + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + "https://www.googleapis.com/auth/devstorage.read_only", + ] + machine_type = "ct6e-standard-4t" + } + management { + auto_upgrade = true + auto_repair = true + } +} + +resource "helm_release" "arc" { + name = "actions-runner-controller" + chart = 
"oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller" + version = "0.9.3" + namespace = var.arc_namespace + create_namespace = true +} + +resource "helm_release" "arc_runner_set" { + name = "v6e-runner-set" + depends_on = [ + helm_release.arc + ] + chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set" + version = "0.9.3" + namespace = var.runner_namespace + create_namespace = true + + values = [ + templatefile("../terraform_modules/arc_v6e_container_cluster/arc-values.yaml", { + github_repo_url = var.github_repo_url + max_tpu_nodes = var.max_tpu_nodes + runner_image = var.runner_image + }) + ] +} diff --git a/infra/terraform_modules/arc_v6e_container_cluster/variables.tf b/infra/terraform_modules/arc_v6e_container_cluster/variables.tf new file mode 100644 index 000000000000..e7d402314055 --- /dev/null +++ b/infra/terraform_modules/arc_v6e_container_cluster/variables.tf @@ -0,0 +1,51 @@ +variable "cluster_name" { + description = "Name of the Container Cluster containing the v6e node pool" + type = string +} + +variable "cpu_nodepool_name" { + description = "Name of the CPU Nodepool" + type = string +} + +variable "cpu_node_count" { + description = "Number of CPU nodes" + type = number +} + +variable "tpu_nodepool_name" { + description = "Name of the TPU Nodepool" + type = string +} + +variable "max_tpu_nodes" { + description = "Maximum number of TPU nodes and runners" + type = number +} + +variable "arc_namespace" { + description = "The namespace where ARC will reside" + default = "arc-systems" + type = string +} + +variable "runner_namespace" { + description = "The namespace where the ARC runners will reside" + default = "arc-runners" + type = string +} + +variable "github_repo_url" { + description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC" + type = string +} + +variable "project_id" { + description = "The project ID" + type = string +} + +variable 
"runner_image" { + description = "The Docker image used in the self-hosted runner" + type = string +} diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf index 3fc50a1ae662..0ad551452c9d 100644 --- a/infra/tpu-pytorch-releases/artifacts_builds.tf +++ b/infra/tpu-pytorch-releases/artifacts_builds.tf @@ -17,22 +17,6 @@ variable "nightly_builds" { default = [] } -// TODO: Remove this after the 2.1 release -variable "xrt_versioned_builds" { - type = list( - object({ - package_version = string - accelerator = string - pytorch_git_rev = optional(string, "") - cuda_version = optional(string, "11.8") - python_version = optional(string, "3.8") - arch = optional(string, "amd64") - }) - ) - - default = [] -} - variable "versioned_builds" { type = list( object({ @@ -62,16 +46,6 @@ locals { ) => b } - // TODO: Remove this after the 2.1 release - xrt_versioned_builds_dict = { - for b in var.xrt_versioned_builds : - format("r%s_%s_%s", - replace(b.package_version, "+", "_"), - b.python_version, - b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version) - ) => b - } - versioned_builds_dict = { for b in var.versioned_builds : format("r%s_%s_%s%s", @@ -129,51 +103,6 @@ module "nightly_builds" { docker_repo_url = module.docker_registry.url } -// TODO: Remove this after the 2.1 release -module "xrt_versioned_builds" { - source = "../terraform_modules/xla_docker_build" - for_each = local.xrt_versioned_builds_dict - - ansible_vars = merge(each.value, { - xla_git_rev = "$COMMIT_SHA", - cxx11_abi = each.value.cxx11_abi - }) - - trigger_on_schedule = { schedule = "0 0 * * *", branch = "xrt" } - - trigger_name = replace(each.key, "/[_.]/", "-") - image_name = "xla" - image_tags = [ - each.key, - # Append _YYYYMMDD suffix to nightly image name. - "${each.key}_$(date +%Y%m%d)", - ] - - description = join(" ", [ - "Builds nightly xla:${each.key}' ${ - each.value.accelerator == "tpu" - ? 
"TPU" - : format("CUDA %s", each.value.cuda_version) - } docker image and corresponding wheels for PyTorch/XLA.", - "Trigger managed by Terraform setup in", - "infra/tpu-pytorch-releases/artifacts_builds.tf." - ]) - - wheels_dest = "${module.releases_storage_bucket.url}/wheels/xrt/${ - each.value.accelerator == "tpu" - ? "tpuvm" - : "cuda/${each.value.cuda_version}" - }" - wheels_srcs = ["/dist/*.whl"] - build_args = { - python_version = each.value.python_version - } - - scheduler_account_email = module.scheduler_account.email - worker_pool_id = module.worker_pool.id - docker_repo_url = module.docker_registry.url -} - module "versioned_builds" { source = "../terraform_modules/xla_docker_build" for_each = local.versioned_builds_dict diff --git a/infra/tpu-pytorch-releases/iam.auto.tfvars b/infra/tpu-pytorch-releases/iam.auto.tfvars index 10584c44df5a..a50da998a9b7 100644 --- a/infra/tpu-pytorch-releases/iam.auto.tfvars +++ b/infra/tpu-pytorch-releases/iam.auto.tfvars @@ -5,3 +5,10 @@ project_admins = [ cloudbuild_editors = [ ] + +project_remote_build_writers = [ + "group:cloud-tpus-dev-team@twosync.google.com", + "user:pytorchxla-general@google.com", + # tpu-pytorch-releases project: default Service Account for running Cloud Build jobs. + "serviceAccount:1001674285173@cloudbuild.gserviceaccount.com" +] diff --git a/infra/tpu-pytorch-releases/iam.tf b/infra/tpu-pytorch-releases/iam.tf index d300bfbec566..a6c78df9c2b0 100644 --- a/infra/tpu-pytorch-releases/iam.tf +++ b/infra/tpu-pytorch-releases/iam.tf @@ -33,3 +33,35 @@ resource "google_project_iam_member" "cloudbuild_editor" { role = "roles/cloudbuild.builds.editor" member = each.value } + +resource "google_project_iam_custom_role" "remote_bazel_role" { + role_id = "remoteBuildWriterRole" + title = "Remote Build Writer" + description = "For running remote bazel builds and read/write from remote cache on GCP." 
+ stage = "ALPHA" + permissions = [ + "remotebuildexecution.actions.create", + "remotebuildexecution.actions.get", + "remotebuildexecution.actions.set", + "remotebuildexecution.blobs.create", + "remotebuildexecution.blobs.get", + "remotebuildexecution.logstreams.create", + "remotebuildexecution.logstreams.get", + "remotebuildexecution.logstreams.update", + ] +} + +data "google_project" "project" {} + +variable "project_remote_build_writers" { + type = list(string) + default = [] +} + +resource "google_project_iam_member" "project_remote_build_writers" { + for_each = toset(var.project_remote_build_writers) + + project = data.google_project.project.project_id + role = google_project_iam_custom_role.remote_bazel_role.id + member = each.value +} diff --git a/infra/tpu-pytorch-releases/infra_triggers.tf b/infra/tpu-pytorch-releases/infra_triggers.tf index 07e6b967ac89..b342b6d90cf7 100644 --- a/infra/tpu-pytorch-releases/infra_triggers.tf +++ b/infra/tpu-pytorch-releases/infra_triggers.tf @@ -6,4 +6,5 @@ module "terraform_apply" { config_directory = "infra/tpu-pytorch-releases" worker_pool_id = module.worker_pool.id + location = "global" } diff --git a/infra/tpu-pytorch-releases/test_triggers.tf b/infra/tpu-pytorch-releases/test_triggers.tf new file mode 100644 index 000000000000..111e0a5cb290 --- /dev/null +++ b/infra/tpu-pytorch-releases/test_triggers.tf @@ -0,0 +1,50 @@ +module "tpu_e2e_tests" { + source = "../terraform_modules/xla_docker_build" + + trigger_name = "ci-tpu-test-trigger" + + trigger_on_push = { + branch = "master" + ignored_files = ["experimental/torch_xla2/**"] + } + run_e2e_tests = true + + image_name = "pytorch-xla-test" + image_tags = [ + # $BUILD_ID is a GCB variable, not a bash variable. + # See https://cloud.google.com/build/docs/configuring-builds/substitute-variable-values#using_default_substitutions. 
+ "$BUILD_ID", + ] + dockerfile = "e2e_tests.Dockerfile" + description = join(" ", [ + "Run e2e TPU tests on an image built from master branch.", + "Trigger managed by Terraform setup in", + "infra/tpu-pytorch-releases/test_triggers.tf.", + ]) + + build_args = { + python_version = "3.10" + } + + ansible_vars = { + arch = "amd64" + accelerator = "tpu" + pytorch_git_rev = "main" + # The commit ID associated with the triggered build. Substituted when + # Cloud Build is triggered. + xla_git_rev = "$COMMIT_SHA" + bundle_libtpu = "0" + } + + # Substitutions used in the "run_e2e_tests" step, see + # infra/terraform_modules/xla_docker_build/xla_docker_build.tf. + substitutions = { + _CLUSTER_NAME = "tpu-cluster" + _CLUSTER_ZONE = "europe-west4-a" + } + + docker_repo_url = module.docker_registry.url + worker_pool_id = module.worker_pool.id + timeout_minutes = 4 * 60 + location = "global" +} diff --git a/infra/tpu-pytorch-releases/tpu_ci.tf b/infra/tpu-pytorch-releases/tpu_ci.tf new file mode 100644 index 000000000000..e90141891b7d --- /dev/null +++ b/infra/tpu-pytorch-releases/tpu_ci.tf @@ -0,0 +1,48 @@ +# This Terraform configuration manages CI/CD infrastructure for PyTorch/XLA testing +# across multiple TPU hardware generations (v4, v5p, v6e). It creates: +# - Separate GKE clusters for each TPU version +# - Node pools with both CPU and TPU nodes +# - GitHub Actions runner configuration for automated testing +# - Custom CI runner container deployment +# +# The infrastructure is used to run automated tests for the pytorch/xla repository +# ensuring compatibility and performance across TPU generations. 
+ +module "v4_arc_cluster" { + source = "../terraform_modules/arc_v4_container_cluster" + project_id = "tpu-pytorch-releases" + cluster_name = "tpu-ci" + cpu_nodepool_name = "cpu-nodepool" + cpu_node_count = 1 + tpu_nodepool_name = "tpu-nodepool" + max_tpu_nodes = 4 + github_repo_url = "https://github.com/pytorch/xla" + # Dockerfile for this image can be found at test/tpu/Dockerfile + runner_image = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest" +} + +module "v5p_arc_cluster" { + source = "../terraform_modules/arc_v5p_container_cluster" + project_id = "tpu-pytorch-releases" + cluster_name = "tpu-ci-v5p" + cpu_nodepool_name = "cpu-nodepool" + cpu_node_count = 1 + tpu_nodepool_name = "tpu-nodepool" + max_tpu_nodes = 4 + github_repo_url = "https://github.com/pytorch/xla" + # Dockerfile for this image can be found at test/tpu/Dockerfile + runner_image = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest" +} + +module "v6e_arc_cluster" { + source = "../terraform_modules/arc_v6e_container_cluster" + project_id = "tpu-pytorch-releases" + cluster_name = "tpu-ci-v6e" + cpu_nodepool_name = "cpu-nodepool" + cpu_node_count = 1 + tpu_nodepool_name = "tpu-nodepool" + max_tpu_nodes = 4 + github_repo_url = "https://github.com/pytorch/xla" + # Dockerfile for this image can be found at test/tpu/Dockerfile + runner_image = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest" +} \ No newline at end of file diff --git a/infra/tpu-pytorch/iam.auto.tfvars b/infra/tpu-pytorch/iam.auto.tfvars index 958c108e2c74..169de15250df 100644 --- a/infra/tpu-pytorch/iam.auto.tfvars +++ b/infra/tpu-pytorch/iam.auto.tfvars @@ -1,7 +1,38 @@ project_remote_build_writers = [ "group:cloud-tpus-dev-team@twosync.google.com", - "user:mlewko@google.com", - "user:goranpetrovic@google.com", + "user:pytorchxla-general@google.com", # tpu-pytorch-releases project: default Service Account for running Cloud Build jobs. 
"serviceAccount:1001674285173@cloudbuild.gserviceaccount.com" ] + +cloudbuild_editors = [ + "user:pytorchxla-general@google.com", +] + +artifact_registry_administrators = [ + "user:pytorchxla-general@google.com", +] + +bigquery_admins = [ + "user:pytorchxla-general@google.com", +] + +compute_admins = [ + "user:pytorchxla-general@google.com", +] + +remote_bazel = [ + "user:pytorchxla-general@google.com", +] + +role_viewers = [ + "user:pytorchxla-general@google.com", +] + +storage_admins = [ + "user:pytorchxla-general@google.com", +] + +tpu_admins = [ + "user:pytorchxla-general@google.com", +] diff --git a/infra/tpu-pytorch/infra_triggers.tf b/infra/tpu-pytorch/infra_triggers.tf index 69770a5151aa..c772466ed30e 100644 --- a/infra/tpu-pytorch/infra_triggers.tf +++ b/infra/tpu-pytorch/infra_triggers.tf @@ -3,7 +3,7 @@ module "terraform_apply" { included_files = ["infra/**"] branch = "master" - config_directory = "infra/tpu-pytorch" + config_directory = "infra/tpu-pytorch-releases" worker_pool_id = module.worker_pool.id location = "global" diff --git a/infra/tpu-pytorch/tpu_ci.tf b/infra/tpu-pytorch/tpu_ci.tf index cf2ae57a8139..64b350ef5f4c 100644 --- a/infra/tpu-pytorch/tpu_ci.tf +++ b/infra/tpu-pytorch/tpu_ci.tf @@ -1,6 +1,6 @@ module "v4_arc_cluster" { source = "../terraform_modules/arc_v4_container_cluster" - project_id = "tpu-pytorch" + project_id = "tpu-pytorch-releases" cluster_name = "tpu-ci" cpu_nodepool_name = "cpu-nodepool" cpu_node_count = 1 @@ -8,5 +8,5 @@ module "v4_arc_cluster" { max_tpu_nodes = 2 github_repo_url = "https://github.com/pytorch/xla" # Dockerfile for this image can be found at test/tpu/Dockerfile - runner_image = "gcr.io/tpu-pytorch/tpu-ci-runner:latest" + runner_image = "gcr.io/tpu-pytorch-releases/tpu-ci-runner:latest" } diff --git a/scripts/update_torch_wheels.sh b/scripts/update_torch_wheels.sh index 534f5d151170..b2622b2b4cf6 100755 --- a/scripts/update_torch_wheels.sh +++ b/scripts/update_torch_wheels.sh @@ -2,7 +2,7 @@ set -e set -x 
-DIST_BUCKET="gs://tpu-pytorch/wheels" +DIST_BUCKET="gs://tpu-pytorch-releases/wheels" TORCH_WHEEL="torch-nightly-cp36-cp36m-linux_x86_64.whl" TORCH_XLA_WHEEL="torch_xla-nightly-cp36-cp36m-linux_x86_64.whl" TORCHVISION_WHEEL="torchvision-nightly-cp36-cp36m-linux_x86_64.whl"