Framework: Speed up CUDA PR builds/tests #11391

Status: Merged (38 commits, Dec 18, 2023)

Commits (38)
fdaf8ef
Use autotester # to drive which GPU is used
sebrowne Dec 8, 2022
d6c4f39
Start adding testing for gpu_utils
sebrowne Dec 8, 2022
9d7ef5b
Call functions with parentheses
sebrowne Dec 14, 2022
4ee242f
Fix error in case that no nvidia-smi exists
sebrowne Dec 15, 2022
58a5058
Use strings for GPU IDs
sebrowne Dec 15, 2022
78d13d2
Correct cuda detection
sebrowne Dec 14, 2022
39610bc
Remove build_name changes from LaunchDriver.py
sebrowne Dec 14, 2022
e00fdbe
Use autotester # to drive which GPU is used
sebrowne Dec 8, 2022
58c407f
Start adding testing for gpu_utils
sebrowne Dec 8, 2022
b62701c
Call functions with parentheses
sebrowne Dec 14, 2022
16b5bbb
Fix error in case that no nvidia-smi exists
sebrowne Dec 15, 2022
66a3b88
Use strings for GPU IDs
sebrowne Dec 15, 2022
524f675
Correct mock return type
sebrowne Dec 15, 2022
bf4e190
More test fixes
sebrowne Dec 15, 2022
7caff01
Only use 1 slot per GPU
sebrowne Dec 16, 2022
11ab954
Revert "Only use 1 slot per GPU"
sebrowne Dec 17, 2022
b158376
Remove resource directives from RDC
sebrowne Mar 15, 2023
a9fb1be
Increase testing parallelism to 16
sebrowne Mar 15, 2023
b42f0ef
Merge branch 'develop' into sebrown/TRILFRAME-522
sebrowne Jun 12, 2023
2683c81
Re-disable TriBITS GPU specifications
sebrowne Jun 12, 2023
e40c2f6
Merge branch 'develop' into sebrown/autotester_gpu_assignment
sebrowne Oct 26, 2023
3d009c1
Use resource spec writing from TriBITS
sebrowne Oct 26, 2023
e938026
Merge branch 'sebrown/autotester_gpu_assignment' into sebrown/TRILFRA…
sebrowne Oct 26, 2023
4baac0a
Merge branch 'develop' into sebrown/TRILFRAME-522
sebrowne Oct 26, 2023
9912c63
test repo access
achauphan Nov 15, 2023
98b40cf
Fix appended cmake list to be the expected semicolon delimiter
achauphan Nov 15, 2023
8b173f1
Removed unnecessary quotes within cmake list of extra configure args
achauphan Nov 17, 2023
c4f316c
Forced Kokkos_CoreUnitTest_Cuda1_MPI_1 test to run serially on GPUs
achauphan Nov 22, 2023
43cf95a
Changed more tests to run serially
achauphan Nov 29, 2023
2b0aa61
Reverted previous change for Adelus test to run serially
Dec 6, 2023
f242883
Reverted previous change for Adelus test to run serially
achauphan Dec 6, 2023
a82c52e
Correct some bad unit test logic
sebrowne Dec 11, 2023
2ae8f39
Merge branch 'sebrown/TRILFRAME-522' of https://github.com/sebrowne/T…
Dec 11, 2023
a5cfc61
Merge branch 'develop' into sebrown/TRILFRAME-522
Dec 12, 2023
ad671d6
Reverted Phalanx ViewOfViews test RUN_SERIAL designation on CUDA builds
achauphan Dec 12, 2023
fb8f3e1
Merge branch 'develop' into sebrown/TRILFRAME-522
achauphan Dec 13, 2023
fcc92db
Disabled Adelus_vector_random_npr3_rhs1_MPI_3 test as it was unstable.
achauphan Dec 14, 2023
89679a8
Merge branch 'sebrown/TRILFRAME-522' of github.com:sebrowne/Trilinos …
achauphan Dec 14, 2023
11 changes: 8 additions & 3 deletions packages/framework/ini-files/config-specs.ini
@@ -2305,6 +2305,14 @@ opt-set-cmake-var Kokkos_CoreUnitTest_CudaTimingBased_MPI_1_DISABLE BOOL : ON
# MPI issue (TRILFRAME-552)
opt-set-cmake-var ROL_example_PinT_parabolic-control_AugmentedSystem_test_MPI_2_DISABLE BOOL FORCE : ON

# This is run serially to try to prevent some unpredictable issues where these tests may try to
# overlap other executing tests on the same GPU after introducing the use of multiple GPUs for testing.
# (https://github.com/trilinos/Trilinos/pull/11391)
opt-set-cmake-var Kokkos_CoreUnitTest_Cuda1_MPI_1_RUN_SERIAL BOOL FORCE : ON
# Test has been unstable, disabling with Sam B's approval.
# (https://github.com/trilinos/Trilinos/pull/11391)
opt-set-cmake-var Adelus_vector_random_npr3_rhs1_MPI_3_DISABLE BOOL : ON

use PACKAGE-ENABLES|NO-EPETRA

use RHEL7_POST
@@ -2346,9 +2354,6 @@ use NODE-TYPE|CUDA_USE-RDC|YES_USE-PT|YES
use USE-RDC|YES
use PACKAGE-ENABLES|ALL-NO-EPETRA

opt-set-cmake-var Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE BOOL : ON
opt-set-cmake-var Trilinos_CUDA_NUM_GPUS STRING : 4
opt-set-cmake-var Trilinos_CUDA_SLOTS_PER_GPU STRING : 2
opt-set-cmake-var CMAKE_CXX_FLAGS STRING FORCE : -Wall -Wunused-parameter -Werror=unused-parameter -Wshadow -Werror=shadow -pedantic -Werror=pedantic -Werror=sign-compare -Werror=sign-compare -Wtype-limits -Werror=type-limits -Wuninitialized -Werror=uninitialized

# This is temporarily disabled because it seems to be particularly sensitive to the spack-built
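The three hard-coded resource settings removed above (Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE, Trilinos_CUDA_NUM_GPUS, Trilinos_CUDA_SLOTS_PER_GPU) are now supplied at configure time from detected hardware (see TrilinosPRConfigurationStandard.py below). For reference, a minimal sketch, assuming CTest's documented resource-specification JSON schema, of the kind of file such auto-generation would produce for four GPUs with two slots each; the exact file TriBITS writes may differ:

import json

def make_resource_spec(gpu_ids, slots_per_gpu):
    # One "gpus" resource per detected GPU ID, each advertising the requested
    # number of slots, in CTest's resource-specification layout.
    return {
        "version": {"major": 1, "minor": 0},
        "local": [{
            "gpus": [{"id": gpu_id, "slots": slots_per_gpu} for gpu_id in gpu_ids]
        }],
    }

if __name__ == "__main__":
    print(json.dumps(make_resource_spec(["0", "1", "2", "3"], 2), indent=2))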
10 changes: 4 additions & 6 deletions packages/framework/pr_tools/LaunchDriver.py
@@ -25,7 +25,7 @@



def get_launch_env(build_name : str, system : str):
def get_launch_env(system : str):
"""
Gets the launch environment based on the detected system.
This is an early environment that's required for running the driver.
@@ -34,16 +34,14 @@ def get_launch_env(build_name : str, system : str):
str: The environment used to launch the driver.
"""
env = ""
if "_rdc" in build_name:
env += " TRILINOS_MAX_CORES=96"

if env == "":
return ""
else:
return "env" + env + " "


def get_launch_cmd(build_name : str, system : str):
def get_launch_cmd(system : str):
"""
Gets the launch command based on the detected system.

@@ -95,8 +93,8 @@ def main(argv):

ds = DetermineSystem(args.build_name, args.supported_systems)

launch_env = get_launch_env(args.build_name, ds.system_name)
launch_cmd = get_launch_cmd(args.build_name, ds.system_name)
launch_env = get_launch_env(ds.system_name)
launch_cmd = get_launch_cmd(ds.system_name)
driver_args = get_driver_args(ds.system_name)

# Specify, and override the driver script for ATDM ATS2 builds. Note that
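With build_name dropped from both helpers, callers only need the detected system name. A minimal usage sketch, following the "dne" placeholder system name used in test_LaunchDriver.py below; composing the final command line this way is an assumption for illustration, not code taken from main():

import LaunchDriver as ld

launch_env = ld.get_launch_env("dne")   # "" when no launch environment is required
launch_cmd = ld.get_launch_cmd("dne")   # " " when no wrapper command is required
print(repr(launch_env + launch_cmd + "<driver command>"))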
2 changes: 1 addition & 1 deletion packages/framework/pr_tools/PullRequestLinuxDriver.sh
@@ -201,7 +201,7 @@ test_cmd_options=(
--jenkins-job-number=${BUILD_NUMBER:?}
--req-mem-per-core=4.0
--max-cores-allowed=${TRILINOS_MAX_CORES:=29}
--num-concurrent-tests=4
--num-concurrent-tests=16
--test-mode=${mode}
--workspace-dir=${WORKSPACE:?}
--filename-packageenables=${WORKSPACE:?}/packageEnables.cmake
packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py
@@ -10,6 +10,7 @@
from . import TrilinosPRConfigurationBase
from gen_config import GenConfig
from pathlib import Path
from .sysinfo import gpu_utils


class TrilinosPRConfigurationStandard(TrilinosPRConfigurationBase):
@@ -74,9 +75,19 @@ def execute_test(self):
"-DUSE_EXPLICIT_TRILINOS_CACHEFILE:BOOL=" + "ON" if self.arg_use_explicit_cachefile else "OFF",
]


if gpu_utils.has_nvidia_gpus():
self.message("-- REMARK: I see that I am running on a machine that has NVidia GPUs; I will feed TriBITS some data enabling GPU resource management")
slots_per_gpu = 2
gpu_indices = gpu_utils.list_nvidia_gpus()
self.message(f"-- REMARK: Using {slots_per_gpu} slots per GPU")
self.message(f"-- REMARK: Using GPUs {gpu_indices}")
cmd.append(f"-DEXTRA_CONFIGURE_ARGS:STRING=-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON; -DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)}; -DTrilinos_CUDA_SLOTS_PER_GPU:STRING={slots_per_gpu}")

if self.arg_extra_configure_args:
cmd.append(f"-DEXTRA_CONFIGURE_ARGS:STRING={';'.join(self.arg_extra_configure_args)}")


self.message( "--- ctest version:")
if not self.args.dry_run:
try:
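On a machine where gpu_utils reports four GPUs, the block above appends a single semicolon-delimited EXTRA_CONFIGURE_ARGS entry. A minimal sketch of the resulting argument string, assuming four detected GPUs and the hard-coded two slots per GPU:

gpu_indices = ["0", "1", "2", "3"]  # example of what gpu_utils.list_nvidia_gpus() might return
slots_per_gpu = 2
extra_arg = (
    "-DEXTRA_CONFIGURE_ARGS:STRING="
    "-DTrilinos_AUTOGENERATE_TEST_RESOURCE_FILE:BOOL=ON; "
    f"-DTrilinos_CUDA_NUM_GPUS:STRING={len(gpu_indices)}; "
    f"-DTrilinos_CUDA_SLOTS_PER_GPU:STRING={slots_per_gpu}"
)
print(extra_arg)  # same string that cmd.append() builds in execute_test() above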
packages/framework/pr_tools/trilinosprhelpers/sysinfo/gpu_utils.py (new file)
@@ -0,0 +1,23 @@
import os
import subprocess
from shutil import which


def has_nvidia_gpus():
return bool(list_nvidia_gpus())


def _nvidia_smi():
if which('nvidia-smi'):
with open(os.devnull) as errout:
return subprocess.check_output('nvidia-smi --list-gpus', stderr=errout, shell=True).splitlines()
return []


def list_nvidia_gpus():
gpu_ids = []
try:
gpu_ids = [str(x) for x in range(0, len(_nvidia_smi()))]
except Exception as e:
raise RuntimeError("Failed to acquire list of gpus: {0}".format(str(e)))
return gpu_ids
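A short usage sketch for the new helpers; the import path follows the unit tests below (trilinosprhelpers.sysinfo.gpu_utils) and assumes the pr_tools directory is on sys.path:

from trilinosprhelpers.sysinfo import gpu_utils

if gpu_utils.has_nvidia_gpus():
    ids = gpu_utils.list_nvidia_gpus()   # e.g. ["0", "1", "2", "3"]
    print(f"Detected {len(ids)} NVidia GPU(s): {ids}")
else:
    print("No NVidia GPUs detected (nvidia-smi missing or reported no devices)")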
packages/framework/pr_tools/unittests/test_gpu_utils.py (new file)
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
"""
"""
from __future__ import print_function

import os

from unittest import TestCase

try: # pragma: no cover
import unittest.mock as mock # pragma: no cover
from unittest.mock import patch
except: # pragma: no cover
import mock # pragma: no cover
from mock import patch

import trilinosprhelpers.sysinfo as sysinfo


#==============================================================================
#
# M O C K H E L P E R S
#
#==============================================================================

def mock_nvidia_smi():
return ["GPU 0: Tesla V100S-PCIE-32GB (UUID: GPU-somehash1)",
"GPU 1: Tesla V100S-PCIE-32GB (UUID: GPU-somehash2)",
"GPU 2: Tesla V100S-PCIE-32GB (UUID: GPU-somehash3)",
"GPU 3: Tesla V100S-PCIE-32GB (UUID: GPU-somehash4)"]

def mock_which(thing_to_find):
return os.path.join(os.getcwd(), thing_to_find)


#==============================================================================
#
# T E S T S
#
#==============================================================================

class GpuUtilsTest(TestCase):
"""
Tests for gpu_utils.
"""
def setUp(self):
self.maxDiff = None

def test_list_nvidia_gpus(self):
"""
Test that sane output from nvidia-smi yields a sane list of gpu indices.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils._nvidia_smi", side_effect=mock_nvidia_smi):
ret = sysinfo.gpu_utils.list_nvidia_gpus()
self.assertEqual(["0", "1", "2", "3"], ret)

def test_has_nvidia_gpus(self):
"""
Test that sane output from nvidia-smi yields positive for system possessing NVidia GPUs.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils._nvidia_smi", side_effect=mock_nvidia_smi):
ret = sysinfo.gpu_utils.has_nvidia_gpus()
self.assertTrue(ret)

def test_nvidia_smi_output_without_smi(self):
"""
Test that without nvidia-smi available the smi interface returns an empty list of output.
"""
print("")
with patch("trilinosprhelpers.sysinfo.gpu_utils.which", return_value=None):
ret = sysinfo.gpu_utils._nvidia_smi()
self.assertEqual([], ret)
7 changes: 2 additions & 5 deletions packages/framework/pr_tools/unittests/test_LaunchDriver.py
@@ -27,15 +27,12 @@ def setUp(self):

## Test LaunchDriver methods
def testUnitGetLaunchEnv(self):
env = ld.get_launch_env(self.build_name+"_rdc", "dne")
self.assertEqual(env, "env TRILINOS_MAX_CORES=96 ")

env = ld.get_launch_env(self.build_name, "dne")
env = ld.get_launch_env("dne")
self.assertEqual(env, "")


def testUnitGetLaunchCmd(self):
cmd = ld.get_launch_cmd(self.build_name, "dne")
cmd = ld.get_launch_cmd("dne")
self.assertEqual(cmd, " ")

