Skip to content

Commit

Permalink
Port infra from tpu-pytorch to tpu-pytorch-releases and support v4, v…
Browse files Browse the repository at this point in the history
…5p, v6e CI run
  • Loading branch information
zpcore committed Jan 15, 2025
1 parent 4edbf61 commit 8940757
Show file tree
Hide file tree
Showing 24 changed files with 552 additions and 99 deletions.
4 changes: 2 additions & 2 deletions .github/ci.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ For the C++ test groups in either case, the test binaries are pre-built during t

The TPU CI runs only a subset of our tests due to capacity constraints, defined in `_tpu_ci.yml` `test/tpu/run_tests.sh`. The runners themselves are containers in GKE managed by [ARC](https://github.com/actions/actions-runner-controller). The container image is also based on our dev images, with some changes for ARC compatibility. The Dockerfile for this image lives in `test/tpu/Dockerfile`.

The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch/tpu_ci.yml`.
The actual ARC cluster is defined in Terraform at `infra/tpu-pytorch-releases/tpu_ci.yml`.

### Reproducing test failures

Expand Down Expand Up @@ -95,7 +95,7 @@ If the TPU CI won't run, try to debug using the following steps:
On your cloudtop:

```
gcloud config set project tpu-pytorch
gcloud config set project tpu-pytorch-releases
gcloud container clusters get-credentials tpu-ci --location=us-central2
```

Expand Down
18 changes: 15 additions & 3 deletions .github/workflows/_tpu_ci.yml
Original file line number Diff line number Diff line change
@@ -1,36 +1,48 @@
name: TPU Integration Test
on:
workflow_call:
inputs:
tpu-version:
required: true
type: string
runner-label:
required: true
type: string

jobs:
tpu-test:
runs-on: v4-runner-set
# Use dynamic runner based on TPU version
runs-on: ${{ inputs.runner-label }}-runner-set
steps:
- name: Checkout actions
uses: actions/checkout@v4
with:
sparse-checkout: |
.github/workflows/setup
path: .actions

- name: Setup
uses: ./.actions/.github/workflows/setup
with:
torch-commit: ${{ inputs.torch-commit }}
wheels-artifact: torch-xla-wheels

- name: Install test dependencies
shell: bash
run: |
# TODO: Add these in setup.py
pip install --upgrade pip
pip install fsspec
pip install rich
# Jax nightly is needed for pallas tests.
# Jax nightly is needed for pallas tests
pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html
pip install --upgrade protobuf
- name: Run Tests
env:
PJRT_DEVICE: TPU
TPU_LOG_DIR: tpu_logs
TPU_VERSION: ${{ inputs.tpu-version }}
run: |
cd pytorch/xla
test/tpu/run_tests.sh
7 changes: 7 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ jobs:
uses: ./.github/workflows/_tpu_ci.yml
needs: build-torch-xla
if: github.event_name == 'push' || github.event_name == 'pull_request'
strategy:
matrix:
tpu-version: ['v4', 'v5p', 'v6e']
fail-fast: false # Continue running other TPU versions if one fails
with:
tpu-version: ${{ matrix.tpu-version }}
runner-label: ${{ format('tpu-{0}', matrix.tpu-version) }}

push-docs:
name: "Build docs"
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/run_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ shift $(($OPTIND - 1))

# func for test after ssh to VM, create container and execute in container
function benchmarking_in_container {
sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8
sudo docker pull gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
nvidia-smi
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash
sudo docker run --gpus all -it -d gcr.io/tpu-pytorch-releases/xla:nightly_3.8_cuda_11.8 bin/bash
sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash
# install torchbench
cd ~
Expand Down
16 changes: 8 additions & 8 deletions docker/cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Cloud Build Configuration which:
# (1) Builds, tests, and pushes gcr.io/tpu-pytorch/xla image
# (1) Builds, tests, and pushes gcr.io/tpu-pytorch-releases/xla image
# (2) Collects and stores torch and torch_xla wheels
steps:
- name: 'gcr.io/cloud-builders/docker'
Expand All @@ -16,20 +16,20 @@ steps:
'--build-arg', 'cuda_compute=${_CUDA_COMPUTE}',
'--build-arg', 'xla_branch=${_GITHUB_XLA_BRANCH}',
'--build-arg', 'examle_branch=${_GITHUB_EXAMPLE_BRANCH}',
'-t', 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}',
'-t', 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}',
'-f', 'docker/Dockerfile', '.'
]
timeout: 14400s
- name: 'gcr.io/cloud-builders/docker'
entrypoint: bash
args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME} gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}_$(date -u +%Y%m%d)']
- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
entrypoint: bash
args: ['-c', 'source /pytorch/xla/docker/common.sh && run_deployment_tests']
- name: 'gcr.io/cloud-builders/docker'
args: ['push', '--all-tags', 'gcr.io/tpu-pytorch/xla']
args: ['push', '--all-tags', 'gcr.io/tpu-pytorch-releases/xla']
timeout: 2700s
- name: 'gcr.io/tpu-pytorch/xla:${_IMAGE_NAME}'
- name: 'gcr.io/tpu-pytorch-releases/xla:${_IMAGE_NAME}'
entrypoint: 'bash'
args: ['-c', 'source /pytorch/xla/docker/common.sh && collect_wheels ${_RELEASE_VERSION}']

Expand All @@ -48,12 +48,12 @@ substitutions:
_GITHUB_EXAMPLE_BRANCH: 'master'
options:
pool:
name: 'projects/tpu-pytorch/locations/us-central1/workerPools/wheel_build'
name: 'projects/tpu-pytorch-releases/locations/us-central1/workerPools/wheel_build'
dynamic_substitutions: true
substitution_option: 'ALLOW_LOOSE'
timeout: 32000s
artifacts:
objects:
# CUDA wheels exported under `wheels/cuda/<cuda_version>`
location: 'gs://tpu-pytorch/wheels/$_UPLOAD_SUBDIR'
location: 'gs://tpu-pytorch-releases/wheels/$_UPLOAD_SUBDIR'
paths: ['/**/*.whl']
10 changes: 5 additions & 5 deletions docker/debug_cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Cloud Build Configuration which:
# Builds and pushes gcr.io/tpu-pytorch/xla_debug image.
# Builds and pushes gcr.io/tpu-pytorch-releases/xla_debug image.
steps:
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk:slim'
args: ['bash', 'docker/debug_image_cleanup.sh']
Expand All @@ -11,16 +11,16 @@ steps:
'--build-arg', 'python_version=${_PYTHON_VERSION}',
'--build-arg', 'cloud_build=true',
'--build-arg', 'release_version=${_RELEASE_VERSION}',
'-t', 'gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}',
'--cache-from', 'gcr.io/tpu-pytorch/xla_debug:nightly_3.6',
'-t', 'gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}',
'--cache-from', 'gcr.io/tpu-pytorch-releases/xla_debug:nightly_3.6',
'-f', 'docker/Dockerfile', '.'
]
timeout: 14400s
- name: 'gcr.io/cloud-builders/docker'
entrypoint: bash
args: ['-c', 'docker tag gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
args: ['-c', 'docker tag gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME} gcr.io/tpu-pytorch-releases/xla_debug:${_TAG_NAME}_$(date -u +%Y%m%d_%H_%M)']
- name: 'gcr.io/cloud-builders/docker'
args: ['push', 'gcr.io/tpu-pytorch/xla_debug']
args: ['push', 'gcr.io/tpu-pytorch-releases/xla_debug']
timeout: 1800s

options:
Expand Down
2 changes: 1 addition & 1 deletion docker/debug_image_cleanup.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
IMAGE="gcr.io/tpu-pytorch/xla_debug"
IMAGE="gcr.io/tpu-pytorch-releases/xla_debug"
DATE=$(date --date='-90 days' +"%Y-%m-%dT%H:%M:%S")

for digest in $(gcloud container images list-tags ${IMAGE} --limit=999999 --sort-by=TIMESTAMP --filter="timestamp.datetime < '${DATE}'" --format='get(digest)'); do
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Explicitly source bashrc even when running commands directly.
# Since commands run as a separate subshell, we need to source manually.
# ex. docker run -it gcr.io/tpu-pytorch/xla:nightly bash ...
# ex. docker run -it gcr.io/tpu-pytorch-releases/xla:nightly bash ...
# The above will not source bashrc without entrypoint.
source ~/.bashrc

Expand Down
18 changes: 18 additions & 0 deletions infra/terraform_modules/arc_v5p_container_cluster/arc-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
githubConfigUrl: ${github_repo_url}
githubConfigSecret: github-pat
minRunners: 1
maxRunners: ${max_tpu_nodes}
template:
spec:
containers:
- name: runner
image: ${runner_image}
command: ["/home/runner/run.sh"]
resources:
limits:
google.com/tpu: 4
requests:
google.com/tpu: 4
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice
cloud.google.com/gke-tpu-topology: 2x2x1
99 changes: 99 additions & 0 deletions infra/terraform_modules/arc_v5p_container_cluster/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
provider "google" {
project = var.project_id
}

provider "helm" {
kubernetes {
host = "https://${google_container_cluster.arc_v5p_cluster.endpoint}"
token = data.google_client_config.default.access_token
cluster_ca_certificate = base64decode(google_container_cluster.arc_v5p_cluster.master_auth.0.cluster_ca_certificate)
}
}

data "google_client_config" "default" {}

resource "google_container_cluster" "arc_v5p_cluster" {
name = var.cluster_name
location = "us-central2"

remove_default_node_pool = true
initial_node_count = 1

release_channel {
channel = "RAPID"
}

min_master_version = 1.28
}

resource "google_container_node_pool" "arc_v5p_cpu_nodes" {
name = var.cpu_nodepool_name
location = "us-central2"
cluster = google_container_cluster.arc_v5p_cluster.name
node_count = var.cpu_node_count

node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
}

management {
auto_upgrade = true
auto_repair = true
}
}

resource "google_container_node_pool" "arc_v5p_tpu_nodes" {
name = var.tpu_nodepool_name
location = "us-central2"
node_locations = ["us-central2-b"]
cluster = google_container_cluster.arc_v5p_cluster.name
initial_node_count = 1
autoscaling {
total_min_node_count = 1
total_max_node_count = var.max_tpu_nodes
location_policy = "ANY"
}
node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
]
machine_type = "ct5p-hightpu-4t"
}
management {
auto_upgrade = true
auto_repair = true
}
}

resource "helm_release" "arc" {
name = "actions-runner-controller"
chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller"
version = "0.9.3"
namespace = var.arc_namespace
create_namespace = true
}

resource "helm_release" "arc_runner_set" {
name = "v5p-runner-set"
depends_on = [
helm_release.arc
]
chart = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
version = "0.9.3"
namespace = var.runner_namespace
create_namespace = true

values = [
templatefile("../terraform_modules/arc_v5p_container_cluster/arc-values.yaml", {
github_repo_url = var.github_repo_url
max_tpu_nodes = var.max_tpu_nodes
runner_image = var.runner_image
})
]
}
51 changes: 51 additions & 0 deletions infra/terraform_modules/arc_v5p_container_cluster/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
variable "cluster_name" {
description = "Name of the Container Cluster containing the v5p node pool"
type = string
}

variable "cpu_nodepool_name" {
description = "Name of the CPU Nodepool"
type = string
}

variable "cpu_node_count" {
description = "Number of CPU nodes"
type = number
}

variable "tpu_nodepool_name" {
description = "Name of the TPU Nodepool"
type = string
}

variable "max_tpu_nodes" {
description = "Maximum number of TPU nodes and runners"
type = number
}

variable "arc_namespace" {
description = "The namespace where ARC will reside"
default = "arc-systems"
type = string
}

variable "runner_namespace" {
description = "The namespace where the ARC runners will reside"
default = "arc-runners"
type = string
}

variable "github_repo_url" {
description = "The full URL of the repository which will be utilizing the self-hosted runners in ARC"
type = string
}

variable "project_id" {
description = "The project ID"
type = string
}

variable "runner_image" {
description = "The Docker image used in the self-hosted runner"
type = string
}
18 changes: 18 additions & 0 deletions infra/terraform_modules/arc_v6e_container_cluster/arc-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
githubConfigUrl: ${github_repo_url}
githubConfigSecret: github-pat
minRunners: 1
maxRunners: ${max_tpu_nodes}
template:
spec:
containers:
- name: runner
image: ${runner_image}
command: ["/home/runner/run.sh"]
resources:
limits:
google.com/tpu: 4
requests:
google.com/tpu: 4
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-tpu-topology: 2x2
Loading

0 comments on commit 8940757

Please sign in to comment.