From 84d67bfb21eb4918fd714ea94cc09524439fba6a Mon Sep 17 00:00:00 2001
From: gcie
Date: Wed, 26 Feb 2025 15:19:33 +0000
Subject: [PATCH] add A3 Mega NCCL tests

---
 examples/nccl/nccl-a3mega.sh               | 78 ++++++++++++++++++++++
 examples/nccl/{nccl.sh => nccl-a3ultra.sh} |  2 +-
 examples/nccl/nccl.md                      | 25 +++++-
 3 files changed, 99 insertions(+), 6 deletions(-)
 create mode 100755 examples/nccl/nccl-a3mega.sh
 rename examples/nccl/{nccl.sh => nccl-a3ultra.sh} (98%)

diff --git a/examples/nccl/nccl-a3mega.sh b/examples/nccl/nccl-a3mega.sh
new file mode 100755
index 00000000..8bcf5d1a
--- /dev/null
+++ b/examples/nccl/nccl-a3mega.sh
@@ -0,0 +1,78 @@
+# """
+# Copyright 2025 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# """
+
+# This is a script to execute a nccl test. See https://github.com/NVIDIA/nccl-tests for more details
+
+#!/bin/bash
+
+set -x
+echo "Starting workload container for $NNODES benchmark"
+
+# Load all the cuda libs
+/sbin/ldconfig
+
+# Install ping
+apt update -y
+apt install -y iputils-ping
+
+# Start sshd
+/scripts/container_entry.sh daemon &
+
+# Get helper variables to form all hostnames
+export POSTFIX=$(hostname --fqdn | cut -d . -f 2-)
+export WORKERS_BASENAME=$(hostname --fqdn | cut -d . -f 1 | rev | cut -d - -f 2- | rev )
+export NODE_RANK=$JOB_COMPLETION_INDEX
+
+# For every worker, wait till online and add to hostfile
+for i in `seq 0 $(($NNODES-1))`; do
+  OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX}
+  until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do
+    echo Waiting for ${OTHER}...
+    sleep 10
+  done
+  echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile;
+done
+
+cat /tmp/hostfile
+
+# Launch from head node
+if [[ "${NODE_RANK}" -eq "0" ]]; then
+
+  # World Level = 0x0, Rail Aligned = 0x7
+  export NCCL_TESTS_SPLIT_MASK="0x0";
+
+  export NCCL_LIB_DIR=$LD_LIBRARY_PATH
+
+  # Get all relevant NCCL / env vars to pass to all workers
+  ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g')
+
+  mpirun --hostfile /tmp/hostfile \
+    -x $ENV_VARS \
+    --allow-run-as-root \
+    -np $(( GPU_PER_NODE * "${NNODES}" )) \
+    --mca orte_keep_fqdn_hostnames 1 \
+    --mca btl tcp,self \
+    --mca btl_tcp_if_include eth0 \
+    --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \
+    taskset -c 32-63 /scripts/demo_mpi_entry_with_config_profile.sh all_gather_perf \
+      -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100
+else
+  while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do
+    sleep 5
+  done
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/nccl/nccl.sh b/examples/nccl/nccl-a3ultra.sh
similarity index 98%
rename from examples/nccl/nccl.sh
rename to examples/nccl/nccl-a3ultra.sh
index 873c602c..e35b1a5e 100755
--- a/examples/nccl/nccl.sh
+++ b/examples/nccl/nccl-a3ultra.sh
@@ -67,7 +67,7 @@ if [[ "${NODE_RANK}" -eq "0" ]]; then
   mpirun --hostfile /tmp/hostfile \
     -x $ENV_VARS \
     --allow-run-as-root \
-    -mca plm_rsh_no_tree_spawn 1 \
+    --mca plm_rsh_no_tree_spawn 1 \
     --mca orte_keep_fqdn_hostnames 1 \
     --mca btl self,tcp \
     --mca btl_tcp_if_include eth0 \
diff --git a/examples/nccl/nccl.md b/examples/nccl/nccl.md
index 18f31559..fc4fe5c5 100644
--- a/examples/nccl/nccl.md
+++ b/examples/nccl/nccl.md
@@ -2,13 +2,13 @@
 
 This document provides an introduction to running tests for the NVIDIA Collective Communications Library (NCCL). NCCL is a high-performance, multi-GPU communications library used in deep learning and other applications. The test suite helps verify the correct functionality and performance of NCCL on your system. Please visit [NCCL tests github](https://github.com/NVIDIA/nccl-tests?tab=readme-ov-file#nccl-tests) to learn more about NCCL and running it.
 
-Steps presented in this document are designed to run on A3 Ultra machines (`DEVICE_TYPE=h200-141gb-8`).
+Steps presented in this document are designed to run on A3 Ultra and A3 Mega machines (`DEVICE_TYPE=h200-141gb-8` or `DEVICE_TYPE=h100-mega-80gb-8`).
 
 ### 1. Create cluster
 
-Skip this step if you have already provisioned a GKE cluster with A3 Ultra machines.
+Skip this step if you have already provisioned a GKE cluster with A3 Ultra or A3 Mega machines.
 
-First step is to create a cluster with A3 Ultra machine. Execute command below:
+First step is to create a cluster with A3 Ultra or A3 Mega machine. Execute command below:
 
 ```
 python3 xpk.py cluster create \
@@ -19,11 +19,26 @@
 ### 2. Run NCCL workload
 
-To run NCCL tests on created cluster a workload will be submitted using xpk as follows:
+The command to run NCCL tests on A3 clusters depends on the type of machine.
 
+#### A3 Mega
+
+
+```bash
+python3 xpk.py workload create \
+  --workload=nccl-test --command="./examples/nccl/nccl-a3mega.sh" \
+  --base-docker-image=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 \
+  --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
+  --num-nodes=$WORKLOAD_NUM_NODES
 ```
+
+#### A3 Ultra
+
+```bash
 python3 xpk.py workload create \
-  --workload=nccl-test --command="./examples/nccl/nccl.sh" \
+  --workload=nccl-test --command="./examples/nccl/nccl-a3ultra.sh" \
   --base-docker-image=us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.3 \
   --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
   --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
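The xpk commands added in `examples/nccl/nccl.md` reference several shell variables ($PROJECT_ID, $COMPUTE_ZONE, $CLUSTER_NAME, $DEVICE_TYPE, $WORKLOAD_NUM_NODES). A minimal sketch of the environment they assume is shown below; every value is an illustrative placeholder, not something defined by this patch:

```bash
# Illustrative placeholders only -- substitute your own project, zone and cluster.
export PROJECT_ID=my-gcp-project        # GCP project that owns the GKE cluster (placeholder)
export COMPUTE_ZONE=us-central1-a       # zone with A3 capacity (placeholder)
export CLUSTER_NAME=a3-nccl-demo        # cluster created in step 1 of nccl.md (placeholder)
export WORKLOAD_NUM_NODES=2             # number of A3 nodes the test should span (placeholder)

# Pick the device type matching the machine family, as documented in nccl.md:
export DEVICE_TYPE=h100-mega-80gb-8     # A3 Mega
# export DEVICE_TYPE=h200-141gb-8       # A3 Ultra

# Then submit the matching workload, e.g. the A3 Mega variant from nccl.md:
python3 xpk.py workload create \
  --workload=nccl-test --command="./examples/nccl/nccl-a3mega.sh" \
  --base-docker-image=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 \
  --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
  --zone=$COMPUTE_ZONE --project=$PROJECT_ID \
  --num-nodes=$WORKLOAD_NUM_NODES
```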