diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 5f74d194037..56086ccb26a 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -122,19 +122,20 @@ def gather_logs(cluster, key, dest, source) { } def CI_summarize(verbose=false) { - cmd = """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ - python ${CI_LOCATION}/summarize.py \ - --log_directory=${env.LOG_DIR} - """ + def options = "" if (verbose) { - cmd = "${cmd} -v" + options = "${options} -v" } if (weekly || RELEASE) { - cmd = "${cmd} --send-mail" + options = "${options} --send-mail" } - sh "${cmd}" + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python ${CI_LOCATION}/summarize.py \ + --log_directory=${env.LOG_DIR} \ + ${options} + """ } def summarize(item, verbose=false, release=false, send_mail=false) { @@ -354,7 +355,7 @@ def skip() { } echo "Changeset is: ${changeStrings.toArray()}" - if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|prov\/cxi|prov\/lpp|contrib\/aws).*$/ }) { + if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|prov\/cxi|prov\/lpp|contrib\/aws|.github).*$/ }) { echo "DONT RUN!" return true } @@ -587,28 +588,81 @@ pipeline { } } } - stage ('mpich-water') { + stage ('oneccl-water') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_oneccl_water.json") + } + } + } + } + stage ('oneccl-grass') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_oneccl_grass.json") + } + } + } + } + stage ('oneccl-electric') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_oneccl_electric.json") + } + } + } + } + stage ('oneccl-fire') { + agent { + node { + label 'ze' + customWorkspace CUSTOM_WORKSPACE + } + } + options { skipDefaultCheckout() } + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_oneccl_fire.json") + } + } + } + } + stage ('mpich_grass') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_mpich_grass.json") + } + } + } + } + stage ('mpich_water') { steps { script { dir (CI_LOCATION) { - slurm_batch("water", "1", - "${env.LOG_DIR}/build_mpich_water_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=water""" - ) + build_ci("pr_build_mpich_water.json") } } } } - stage ('mpich-grass') { + stage ('impi_grass') { steps { script { dir (CI_LOCATION) { - slurm_batch("grass", "1", - "${env.LOG_DIR}/build_mpich_grass_log", - """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ - --build_item=mpich --build_hw=grass""" - ) + build_ci("pr_build_impi_grass.json") + } + } + } + } + stage ('impi_water') { + steps { + script { + dir (CI_LOCATION) { + build_ci("pr_build_impi_water.json") } } } @@ -621,33 +675,22 @@ pipeline { stage('mpichtestsuite-tcp') { steps { script { - dir (RUN_LOCATION) { - def providers = [['tcp', null]] - def MPIS = ["mpich"] + dir (CI_LOCATION) { + run_ci("CI_mpi_tcp_mpich_mpichtestsuite", "pr_mpich_mpichtestsuite_grass.json") if (env.WEEKLY.toBoolean()) { - MPIS = ["impi", "mpich"] + run_ci("CI_mpi_tcp_impi_mpichtestsuite", "pr_impi_mpichtestsuite_grass.json") } - for (def mpi in MPIS) { - run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "grass", "bulbasaur,ivysaur", "2", "${mpi}") - } - } + } } } } - stage('mpichtestsuite-verbs') { + stage('mpichtestsuite-verbs-rxm') { steps { script { - dir (RUN_LOCATION) { - def providers = [["verbs","rxm"]] - def MPIS = ["mpich"] + dir (CI_LOCATION) { + run_ci("CI_mpi_verbs-rxm_mpich_mpichtestsuite", "pr_mpich_mpichtestsuite_water.json") if (env.WEEKLY.toBoolean()) { - MPIS = ["impi", "mpich"] - } - for (def mpi in MPIS) { - run_middleware(providers, "mpichtestsuite", "mpichtestsuite", - "water", "squirtle,wartortle,articuno", "2", - "${mpi}") + run_ci("CI_mpi_verbs-rxm_impi_mpichtestsuite", "pr_impi_mpichtestsuite_water.json") } } } @@ -828,54 +871,49 @@ pipeline { steps { script { dir (RUN_LOCATION) { - run_middleware([["tcp", null],["sockets", null]], - "multinode_performance", "multinode", "grass", "bulbasaur,ivysaur", "2") - run_middleware([["verbs", "rxm"]], "multinode_performance", - "multinode", "water", "squirtle,wartortle,articuno", "2") + run_ci("CI_multinode_performance_grass", + "pr_multinode_performance_grass.json") + run_ci("CI_multinode_performance_water", + "pr_multinode_performance_water.json") } } } } - stage ('oneCCL') { + stage ('oneccl_grass') { steps { script { - dir (RUN_LOCATION) { - run_middleware([["verbs", null]], "oneCCL", - "oneccl", "water", - "squirtle,wartortle,articuno", "2") - run_middleware([["shm", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,ivysaur", "1") - run_middleware([["psm3", null]], "oneCCL", - "oneccl", "water", - "squirtle,wartortle,articuno", "2") - run_middleware([["tcp", null]], "oneCCL", - "oneccl", "grass", "bulbasaur,ivysaur", "2") - run_middleware([["shm", null]], "oneCCL_DSA", - "oneccl", "electric", "pikachu", "1", null, null, - """CCL_ATL_SHM=1 FI_SHM_DISABLE_CMA=1 \ - FI_SHM_USE_DSA_SAR=1 FI_LOG_LEVEL=warn""") + dir (CI_LOCATION) { + run_ci("CI_oneccl_grass", "pr_oneccl_grass_tcp.json") + run_ci("CI_oneccl_grass", "pr_oneccl_grass_shm.json") + } + } + } + } + stage ('oneccl_water') { + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_oneccl_water", "pr_oneccl_water.json") } } } } - stage ('oneCCL-GPU-v3') { + stage ('oneccl_electric') { + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_oneccl_electric", "pr_oneccl_electric.json") + } + } + } + } + stage ('oneCCL-fire') { agent { node { label 'ze' } } options { skipDefaultCheckout() } steps { script { - dir (RUN_LOCATION) { - run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", - "fire", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") - run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", - "fire", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") - run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", - "fire", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") - run_middleware([["shm", null]], "oneCCL-GPU-v3", "onecclgpu", - "fire", "torchic", "1", null, null, - "FI_HMEM_DISABLE_P2P=1") + dir (CI_LOCATION) { + run_ci("CI_oneccl_fire", "pr_oneccl_fire.json") } } } diff --git a/fabtests/scripts/runmultinode.sh b/fabtests/scripts/runmultinode.sh index d4491de48b8..ebe564d0a8e 100755 --- a/fabtests/scripts/runmultinode.sh +++ b/fabtests/scripts/runmultinode.sh @@ -1,7 +1,7 @@ #!/bin/bash -Options=$(getopt --options h:,n:,p:,I:,-x:,z: \ - --longoptions hosts:,processes-per-node:,provider:,xfer-method:,iterations:,ci:,cleanup,help \ +Options=$(getopt --options h:,n:,p:,I:,-x:-E:,z: \ + --longoptions hosts:,processes-per-node:,provider:,xfer-method:,env:,iterations:,ci:,cleanup,help \ -- "$@") eval set -- "$Options" @@ -10,7 +10,7 @@ hosts=[] ppn=1 iterations=1 pattern="" -xfer-method="msg" +xfer_method="msg" cleanup=false help=false ci="" @@ -19,7 +19,7 @@ while true; do case "$1" in -h|--hosts) IFS=',' read -r -a hosts <<< "$2"; shift 2 ;; - -n|--processes-per-node) + -n|--processes-per-node) ppn=$2; shift 2 ;; -p|--provider) provider="$2"; shift 2 ;; @@ -30,10 +30,17 @@ while true; do --cleanup) cleanup=true; shift ;; -x|--xfer-method) - xfer-method="$2"; shift 2 ;; + xfer_method="$2"; shift 2 ;; + -E|--env) + delimiter="=" + value=${2#*$delimiter} + var=${2:0:$(( ${#2} - ${#value} - ${#delimiter} ))} + EXPORT_STRING="export $var=\"$value\"" + EXPORT_ENV="${EXPORT_ENV}${EXPORT_STRING}; " + shift 2 ;; --ci) ci="$2"; shift 2 ;; - --help) + --help) help=true; shift ;; --) shift; break ;; @@ -41,21 +48,21 @@ while true; do done if $help ; then - echo "Run the multinode test suite on the nodes provided for many procceses" + echo "Run the multinode test suite on the nodes provided for many procceses" echo "multinode tests are run in performance mode" echo "Options" echo "\t-h,--hosts list of host names to run the tests on" - echo "\t-n,--processes-per-node number of processes to be run on each node.\ - Total number of fi_mulinode tests run will be n*number of hosts" + echo "\t-n,--processes-per-node number of processes to be run on each node. Total number of fi_mulinode tests run will be n*number of hosts" echo "\t-p,--provider libfabric provider to run the multinode tests on" - echo "\t-C,--cabability multinode cabability to use (rma or default: msg)" - echo "\t-I,-- iterations number of iterations for the multinode test \ + echo "\t-x,--xfer-method multinode transfer method/capability to use (rma or default: msg)" + echo "\t-E,--env export provided variable name and value" + echo "\t-I,--iterations number of iterations for the multinode test \ to run each pattern on" echo "\t--cleanup end straggling processes. Does not rerun tests" echo "\t--help show this message" exit 1 fi - + num_hosts=${#hosts[@]} max_ranks=$(($num_hosts*$ppn)) ranks=$max_ranks; @@ -65,7 +72,7 @@ output="multinode_server_${num_hosts}_${ppn}.log" ret=0 if ! $cleanup ; then - cmd="${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer-method $pattern -I $iterations -T" + cmd="${EXPORT_ENV} ${ci}fi_multinode -n $ranks -s $server -p '$provider' -x $xfer_method $pattern -I $iterations -T" echo $cmd for node in "${hosts[@]}"; do for i in $(seq 1 $ppn); do @@ -73,7 +80,7 @@ if ! $cleanup ; then echo STARTING SERVER if [ "$ci" == "" ]; then ssh $node $cmd &> $output & - else + else ssh $node $cmd | tee $output & fi server_pid=$! @@ -104,4 +111,4 @@ if ! $cleanup ; then echo "Output: $PWD/$output" fi -exit $ret +exit $ret