AI-Hypercomputer · gcie · Feb 26, 2025 · Feb 26, 2025
@@ -0,0 +1,78 @@
+# """
+# Copyright 2025 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#      https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# """
+
+# This is a script to execute a nccl test. See https://github.com/NVIDIA/nccl-tests for more details
+
+#!/bin/bash
+
+set -x
+echo "Starting workload container for $NNODES benchmark"
+
+# Load all the cuda libs
+/sbin/ldconfig
+
+# Install ping
+apt update -y
+apt install -y iputils-ping
+
+# Start sshd
+/scripts/container_entry.sh daemon &
+
+# Get helper variables to form all hostnames
+export POSTFIX=$(hostname --fqdn | cut -d . -f 2-)
+export WORKERS_BASENAME=$(hostname --fqdn | cut -d . -f 1 | rev | cut -d - -f 2- | rev )
+export NODE_RANK=$JOB_COMPLETION_INDEX
+
+# For every worker, wait till online and add to hostfile
+for i in `seq 0 $(($NNODES-1))`; do
+  OTHER=${WORKERS_BASENAME}-${i}.${POSTFIX}
+  until ssh -p 222 -o StrictHostKeyChecking=no $OTHER hostname; do
+    echo Waiting for ${OTHER}...
+    sleep 10
+  done
+  echo ${OTHER} port=222 slots=8 | tee -a /tmp/hostfile;
+done
+
+cat /tmp/hostfile
+
+# Launch from head node
+if [[ "${NODE_RANK}" -eq "0" ]]; then
+
+    # World Level = 0x0, Rail Aligned = 0x7
+    export NCCL_TESTS_SPLIT_MASK="0x0";
+
+    export NCCL_LIB_DIR=$LD_LIBRARY_PATH
+
+    # Get all relevant NCCL / env vars to pass to all workers
+    ENV_VARS=$(echo ${!NCCL*} ${!OMPI*} LD_LIBRARY_PATH PATH | sed 's/ / -x /g')
+
+    mpirun --hostfile /tmp/hostfile \
+        -x $ENV_VARS \
+        --allow-run-as-root \
+        -np $(( GPU_PER_NODE * "${NNODES}" )) \
+        --mca orte_keep_fqdn_hostnames 1 \
+        --mca btl tcp,self \
+        --mca btl_tcp_if_include eth0 \
+        --mca plm_rsh_agent "ssh -q -o LogLevel=ERROR -o StrictHostKeyChecking=no -p 222" \
+        taskset -c 32-63 /scripts/demo_mpi_entry_with_config_profile.sh all_gather_perf \
+          -b 1K -e 8G -f 2 -g 1 -w 5 --iters 100
+else
+    while ping -c 1 ${WORKERS_BASENAME}-0.${POSTFIX}; do
+    sleep 5
+done
+fi
+
+exit 0
@@ -67,7 +67,7 @@ if [[ "${NODE_RANK}" -eq "0" ]]; then
     mpirun --hostfile /tmp/hostfile \
       -x $ENV_VARS  \
       --allow-run-as-root \
-      -mca plm_rsh_no_tree_spawn 1 \
+      --mca plm_rsh_no_tree_spawn 1 \
       --mca orte_keep_fqdn_hostnames 1 \
       --mca btl self,tcp \
       --mca btl_tcp_if_include eth0 \

@@ -2,13 +2,13 @@
 
 This document provides an introduction to running tests for the NVIDIA Collective Communications Library (NCCL). NCCL is a high-performance, multi-GPU communications library used in deep learning and other applications. The test suite helps verify the correct functionality and performance of NCCL on your system. See the [NCCL tests GitHub repository](https://github.com/NVIDIA/nccl-tests?tab=readme-ov-file#nccl-tests) to learn more about the NCCL tests and how to run them.
 
-Steps presented in this document are designed to run on A3 Ultra machines (`DEVICE_TYPE=h200-141gb-8`).
+Steps presented in this document are designed to run on A3 Ultra and A3 Mega machines (`DEVICE_TYPE=h200-141gb-8` or `DEVICE_TYPE=h100-mega-80gb-8`).
 
 ### 1. Create cluster
 
-Skip this step if you have already provisioned a GKE cluster with A3 Ultra machines.
+Skip this step if you have already provisioned a GKE cluster with A3 Ultra or A3 Mega machines.
 
-First step is to create a cluster with A3 Ultra machine. Execute command below:
+The first step is to create a cluster with A3 Ultra or A3 Mega machines. Run the command below:
 
 ```
 python3 xpk.py cluster create \
@@ -19,11 +19,26 @@ python3 xpk.py cluster create \
 
 ### 2. Run NCCL workload
 
-To run NCCL tests on created cluster a workload will be submitted using xpk as follows:
+The command to run NCCL tests on A3 clusters depends on the type of machine.
 
+
+#### A3 Mega
+
+
+```bash
+python3 xpk.py workload create \
+    --workload=nccl-test --command="./examples/nccl/nccl-a3mega.sh" \
+    --base-docker-image=us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 \
+    --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
+    --zone=$COMPUTE_ZONE  --project=$PROJECT_ID \
+    --num-nodes=$WORKLOAD_NUM_NODES
 ```
+
+#### A3 Ultra
+
+```bash
 python3 xpk.py workload create \
-    --workload=nccl-test --command="./examples/nccl/nccl.sh" \
+    --workload=nccl-test --command="./examples/nccl/nccl-a3ultra.sh" \
     --base-docker-image=us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.0.3 \
     --cluster=$CLUSTER_NAME --device-type=$DEVICE_TYPE \
     --zone=$COMPUTE_ZONE  --project=$PROJECT_ID \