Framework: Speed up CUDA PR builds/tests #11391

Status: Merged (38 commits, Dec 18, 2023)

Commits (38)
fdaf8ef
Use autotester # to drive which GPU is used
sebrowne Dec 8, 2022
d6c4f39
Start adding testing for gpu_utils
sebrowne Dec 8, 2022
9d7ef5b
Call functions with parentheses
sebrowne Dec 14, 2022
4ee242f
Fix error in case that no nvidia-smi exists
sebrowne Dec 15, 2022
58a5058
Use strings for GPU IDs
sebrowne Dec 15, 2022
78d13d2
Correct cuda detection
sebrowne Dec 14, 2022
39610bc
Remove build_name changes from LaunchDriver.py
sebrowne Dec 14, 2022
e00fdbe
Use autotester # to drive which GPU is used
sebrowne Dec 8, 2022
58c407f
Start adding testing for gpu_utils
sebrowne Dec 8, 2022
b62701c
Call functions with parentheses
sebrowne Dec 14, 2022
16b5bbb
Fix error in case that no nvidia-smi exists
sebrowne Dec 15, 2022
66a3b88
Use strings for GPU IDs
sebrowne Dec 15, 2022
524f675
Correct mock return type
sebrowne Dec 15, 2022
bf4e190
More test fixes
sebrowne Dec 15, 2022
7caff01
Only use 1 slot per GPU
sebrowne Dec 16, 2022
11ab954
Revert "Only use 1 slot per GPU"
sebrowne Dec 17, 2022
b158376
Remove resource directives from RDC
sebrowne Mar 15, 2023
a9fb1be
Increase testing parallelism to 16
sebrowne Mar 15, 2023
b42f0ef
Merge branch 'develop' into sebrown/TRILFRAME-522
sebrowne Jun 12, 2023
2683c81
Re-disable TriBITS GPU specifications
sebrowne Jun 12, 2023
e40c2f6
Merge branch 'develop' into sebrown/autotester_gpu_assignment
sebrowne Oct 26, 2023
3d009c1
Use resource spec writing from TriBITS
sebrowne Oct 26, 2023
e938026
Merge branch 'sebrown/autotester_gpu_assignment' into sebrown/TRILFRA…
sebrowne Oct 26, 2023
4baac0a
Merge branch 'develop' into sebrown/TRILFRAME-522
sebrowne Oct 26, 2023
9912c63
test repo access
achauphan Nov 15, 2023
98b40cf
Fix appended cmake list to be the expected semicolon delimiter
achauphan Nov 15, 2023
8b173f1
Removed unnecessary quotes within cmake list of extra configure args
achauphan Nov 17, 2023
c4f316c
Forced Kokkos_CoreUnitTest_Cuda1_MPI_1 test to run serially on GPUs
achauphan Nov 22, 2023
43cf95a
Changed more tests to run serially
achauphan Nov 29, 2023
2b0aa61
Reverted previous change for Adelus test to run serially
Dec 6, 2023
f242883
Reverted previous change for Adelus test to run serially
achauphan Dec 6, 2023
a82c52e
Correct some bad unit test logic
sebrowne Dec 11, 2023
2ae8f39
Merge branch 'sebrown/TRILFRAME-522' of https://github.com/sebrowne/T…
Dec 11, 2023
a5cfc61
Merge branch 'develop' into sebrown/TRILFRAME-522
Dec 12, 2023
ad671d6
Reverted Phalanx ViewOfViews test RUN_SERIAL designation on CUDA builds
achauphan Dec 12, 2023
fb8f3e1
Merge branch 'develop' into sebrown/TRILFRAME-522
achauphan Dec 13, 2023
fcc92db
Disabled Adelus_vector_random_npr3_rhs1_MPI_3 test as it was unstable.
achauphan Dec 14, 2023
89679a8
Merge branch 'sebrown/TRILFRAME-522' of github.com:sebrowne/Trilinos …
achauphan Dec 14, 2023
11 changes: 8 additions & 3 deletions packages/framework/ini-files/config-specs.ini
@@ -2305,6 +2305,14 @@ opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_MPI_1_DISABLE BOOL : ON
# MPI issue (TRILFRAME-552)
opt-set-cmake-var ROL_example_PinT_parabolic-control_AugmentedSystem_test_MPI_2_DISABLE BOOL FORCE : ON

# This is run serially to try to prevent some unpredictable issues where these tests may try to
# overlap other executing tests on the same GPU after introducing the use of multiple GPUs for testing.
# (https://github.com/trilinos/Trilinos/pull/11391)
opt-set-cmake-var Kokkos_CoreUnitTest_Cuda1_MPI_1_RUN_SERIAL BOOL FORCE : ON
# Test has been unstable, disabling with Sam B's approval.
# (https://github.com/trilinos/Trilinos/pull/11391)
opt-set-cmake-var Adelus_vector_random_npr3_rhs1_MPI_3_DISABLE BOOL : ON

use PACKAGE-ENABLES|NO-EPETRA

use RHEL7_POST
@@ -2346,9 +2354,6 @@ use NODE-TYPE|CUDA_USE-RDC|YES_USE-PT|YES
use USE-RDC|YES
use PACKAGE-ENABLES|ALL-NO-EPETRA

opt-set-cmake-var Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE BOOL : ON
opt-set-cmake-var Trilinos_CUDA_NUM_GPUS STRING : 4
opt-set-cmake-var Trilinos_CUDA_SLOTS_PER_GPU STRING : 2
opt-set-cmake-var CMAKE_CXX_FLAGS STRING FORCE : -Wall -Wunused-parameter -Werror=unused-parameter -Wshadow -Werror=shadow -pedantic -Werror=pedantic -Werror=sign-compare -Werror=sign-compare -Wtype-limits -Werror=type-limits -Wuninitialized -Werror=uninitialized

# This is temporarily disabled because it seems to be particularly sensitive to the spack-built
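The three hard-coded resource settings removed above (Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE, Trilinos_CUDA_NUM_GPUS, Trilinos_CUDA_SLOTS_PER_GPU) are now supplied at configure time from detected hardware (see TrilinosPRConfigurationStandard.py below). For reference, a minimal sketch, assuming CTest's documented resource-specification JSON schema, of the kind of file such auto-generation would produce for four GPUs with two slots each; the exact file TriBITS writes may differ:

import json

def make_resource_spec(gpu_ids, slots_per_gpu):
    # One "gpus" resource per detected GPU ID, each advertising the requested
    # number of slots, in CTest's resource-specification layout.
    return {
        "version": {"major": 1, "minor": 0},
        "local": [{
            "gpus": [{"id": gpu_id, "slots": slots_per_gpu} for gpu_id in gpu_ids]
        }],
    }

if __name__ == "__main__":
    print(json.dumps(make_resource_spec(["0", "1", "2", "3"], 2), indent=2))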
10 changes: 4 additions & 6 deletions packages/framework/pr_tools/LaunchDriver.py
@@ -25,7 +25,7 @@



def get_launch_env(build_name : str, system : str):
def get_launch_env(system : str):
"""
Gets the launch environment based on the detected system.
This is an early environment that's required for running the driver.
@@ -34,16 +34,14 @@ def get_launch_env(build_name : str, system : str):
str: The environment used to launch the driver.
"""
env = ""
if "_rdc" in build_name:
env += " TRILINOS_MAX_CORES=96"

if env == "":
return ""
else:
return "env" + env + " "


def get_launch_cmd(build_name : str, system : str):
def get_launch_cmd(system : str):
"""
Gets the launch command based on the detected system.

@@ -95,8 +93,8 @@ def main(argv):

ds = DetermineSystem(args.build_name, args.supported_systems)

launch_env = get_launch_env(args.build_name, ds.system_name)
launch_cmd = get_launch_cmd(args.build_name, ds.system_name)
launch_env = get_launch_env(ds.system_name)
launch_cmd = get_launch_cmd(ds.system_name)
driver_args = get_driver_args(ds.system_name)

# Specify, and override the driver script for ATDM ATS2 builds. Note that
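With build_name dropped from both helpers, callers only need the detected system name. A minimal usage sketch, following the "dne" placeholder system name used in test_LaunchDriver.py below; composing the final command line this way is an assumption for illustration, not code taken from main():

import LaunchDriver as ld

launch_env = ld.get_launch_env("dne")   # "" when no launch environment is required
launch_cmd = ld.get_launch_cmd("dne")   # " " when no wrapper command is required
print(repr(launch_env + launch_cmd + "<driver command>"))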
2 changes: 1 addition & 1 deletion packages/framework/pr_tools/PullRequestLinuxDriver.sh
@@ -201,7 +201,7 @@ test_cmd_options=(
--jenkins-job-number=${BUILD_NUMBER:?}
--req-mem-per-core=4.0
--max-cores-allowed=${TRILINOS_MAX_CORES:=29}
--num-concurrent-tests=4
--num-concurrent-tests=16
--test-mode=${mode}
--workspace-dir=${WORKSPACE:?}
--filename-packageenables=${WORKSPACE:?}/packageEnables.cmake
packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py
@@ -10,6 +10,7 @@
from . import TrilinosPRConfigurationBase
from gen_config import GenConfig
from pathlib import Path
from .sysinfo import gpu_utils


class TrilinosPRConfigurationStandard(TrilinosPRConfigurationBase):
@@ -74,9 +75,19 @@ def execute_test(self):
"-DUSE_EXPLICIT_TRILINOS_CACHEFILE:BOOL=" + "ON" if self.arg_use_explicit_cachefile else "OFF",
]


if gpu_utils.has_nvidia_gpus():
self.message("-- REMARK: I see that I am running on a machine that has NVidia GPUs; I will feed TriBITS some data enabling GPU resource management")
slots_per_gpu = 2
gpu_indices = gpu_utils.list_nvidia_gpus()
self.message(f"-- REMARK: Using {slots_per_gpu} slots per GPU")
self.message(f"-- REMARK: Using GPUs {gpu_indices}")
cmd.append(f"-DEXTRA_CONFIGURE_ARGS:STRING=-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON; -DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)}; -DTrilinos_CUDA_SLOTS_PER_GPU:STRING={slots_per_gpu}")

if self.arg_extra_configure_args:
cmd.append(f"-DEXTRA_CONFIGURE_ARGS:STRING={';'.join(self.arg_extra_configure_args)}")


self.message( "--- ctest version:")
if not self.args.dry_run:
try:
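On a machine where gpu_utils reports four GPUs, the block above appends a single semicolon-delimited EXTRA_CONFIGURE_ARGS entry. A minimal sketch of the resulting argument string, assuming four detected GPUs and the hard-coded two slots per GPU:

gpu_indices = ["0", "1", "2", "3"]  # example of what gpu_utils.list_nvidia_gpus() might return
slots_per_gpu = 2
extra_arg = (
    "-DEXTRA_CONFIGURE_ARGS:STRING="
    "-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON; "
    f"-DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)}; "
    f"-DTrilinos_CUDA_SLOTS_PER_GPU:STRING={slots_per_gpu}"
)
print(extra_arg)  # same string that cmd.append() builds in execute_test() above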
packages/framework/pr_tools/trilinosprhelpers/sysinfo/gpu_utils.py (new file)
@@ -0,0 +1,23 @@
import os
import subprocess
from shutil import which


def has_nvidia_gpus():
return bool(list_nvidia_gpus())


def _nvidia_smi():
if which('nvidia-smi'):
with open(os.devnull) as errout:
return subprocess.check_output('nvidia-smi --list-gpus', stderr=errout, shell=True).splitlines()
return []


def list_nvidia_gpus():
gpu_ids = []
try:
gpu_ids = [str(x) for x in range(0, len(_nvidia_smi()))]
except Exception as e:
raise RuntimeError("Failed to acquire list of gpus: {0}".format(str(e)))
return gpu_ids
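A short usage sketch for the new helpers; the import path follows the unit tests below (trilinosprhelpers.sysinfo.gpu_utils) and assumes the pr_tools directory is on sys.path:

from trilinosprhelpers.sysinfo import gpu_utils

if gpu_utils.has_nvidia_gpus():
    ids = gpu_utils.list_nvidia_gpus()   # e.g. ["0", "1", "2", "3"]
    print(f"Detected {len(ids)} NVidia GPU(s): {ids}")
else:
    print("No NVidia GPUs detected (nvidia-smi missing or reported no devices)")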
packages/framework/pr_tools/unittests/test_gpu_utils.py (new file)
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
"""
"""
from __future__ import print_function

import os

from unittest import TestCase

try: # pragma: no cover
import unittest.mock as mock # pragma: no cover
from unittest.mock import patch
except: # pragma: no cover
import mock # pragma: no cover
from mock import patch

import trilinosprhelpers.sysinfo as sysinfo


#==============================================================================
#
# M O C K H E L P E R S
#
#==============================================================================

def mock_nvidia_smi():
return ["GPU 0: Tesla V100S-PCIE-32GB (UUID: GPU-somehash1)",
"GPU 1: Tesla V100S-PCIE-32GB (UUID: GPU-somehash2)",
"GPU 2: Tesla V100S-PCIE-32GB (UUID: GPU-somehash3)",
"GPU 3: Tesla V100S-PCIE-32GB (UUID: GPU-somehash4)"]

def mock_which(thing_to_find):
return os.path.join(os.getcwd(), thing_to_find)


#==============================================================================
#
# T E S T S
#
#==============================================================================

class GpuUtilsTest(TestCase):
"""
Tests for gpu_utils.
"""
def setUp(self):
self.maxDiff = None

def test_list_nvidia_gpus(self):
"""
Test that sane output from nvidia-smi yields a sane list of gpu indices.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils._nvidia_smi", side_effect=mock_nvidia_smi):
ret = sysinfo.gpu_utils.list_nvidia_gpus()
self.assertEqual(["0", "1", "2", "3"], ret)

def test_has_nvidia_gpus(self):
"""
Test that sane output from nvidia-smi yields positive for system possessing NVidia GPUs.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils._nvidia_smi", side_effect=mock_nvidia_smi):
ret = sysinfo.gpu_utils.has_nvidia_gpus()
self.assertTrue(ret)

def test_nvidia_smi_output_without_smi(self):
"""
Test that without nvidia-smi available the smi interface returns an empty list of output.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils.which", return_value=None):
ret = sysinfo.gpu_utils._nvidia_smi()
self.assertEqual([], ret)
7 changes: 2 additions & 5 deletions packages/framework/pr_tools/unittests/test_LaunchDriver.py
@@ -27,15 +27,12 @@ def setUp(self):

## Test LaunchDriver methods
def testUnitGetLaunchEnv(self):
env = ld.get_launch_env(self.build_name+"_rdc", "dne")
self.assertEqual(env, "env TRILINOS_MAX_CORES=96 ")

env = ld.get_launch_env(self.build_name, "dne")
env = ld.get_launch_env("dne")
self.assertEqual(env, "")


def testUnitGetLaunchCmd(self):
cmd = ld.get_launch_cmd(self.build_name, "dne")
cmd = ld.get_launch_cmd("dne")
self.assertEqual(cmd, " ")

