Commit

updated nvshmem-related build and run scripts: module load nvshmem does not work on Perlmutter, so nvshmem has to be built from source for now; will need to change it back once the nvshmem module is working again
liuyangzhuan committed Jan 6, 2025
1 parent 6574d1f commit 7c3fb7a
Showing 18 changed files with 110 additions and 104 deletions.
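Across the run scripts in this commit, the core change is the same: the nvshmem/2.11.0 module load is removed and each script instead points at a from-source NVSHMEM build, exporting the bootstrap and transport settings explicitly. Consolidated from the diffs below, the enabled block looks like the following sketch; the NVSHMEM_HOME path is specific to the m3894 project directory on Perlmutter, and the inline comments are added here as interpretation rather than taken from the scripts.

# Perlmutter workaround: use a from-source NVSHMEM build instead of the module.
# module load nvshmem/2.11.0                               # re-enable once the module works again
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1                               # enable GDRCopy support
export NVSHMEM_MPI_SUPPORT=1                               # MPI support, used for the bootstrap
export MPI_HOME=${MPICH_DIR}
export NVSHMEM_LIBFABRIC_SUPPORT=1                         # remote transport over libfabric (Slingshot)
export LIBFABRIC_HOME=/opt/cray/libfabric/1.20.1
export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH  # pick up the from-source libraries at run time
export NVSHMEM_DISABLE_CUDA_VMM=1
export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric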
@@ -9,7 +9,6 @@ module load cudatoolkit/12.2
# module load cray-libsci/22.11.1.2
module load cray-libsci/23.12.5
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0
ulimit -s unlimited
#MPI settings:
export MPICH_GPU_SUPPORT_ENABLED=1
@@ -37,22 +36,23 @@ nmpipergpu=1
export SUPERLU_MPI_PROCESS_PER_GPU=$nmpipergpu # 2: this can better saturate GPU

# ##NVSHMEM settings:
# # NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# export NVSHMEM_USE_GDRCOPY=1
# export NVSHMEM_MPI_SUPPORT=1
# export MPI_HOME=${MPICH_DIR}
# export NVSHMEM_LIBFABRIC_SUPPORT=1
# export LIBFABRIC_HOME=/opt/cray/libfabric/1.20.1
# export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH
# export NVSHMEM_DISABLE_CUDA_VMM=1
# export FI_CXI_OPTIMIZED_MRS=false
# export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
# export NVSHMEM_BOOTSTRAP=MPI
# export NVSHMEM_REMOTE_TRANSPORT=libfabric

# #export NVSHMEM_DEBUG=TRACE
# #export NVSHMEM_DEBUG_SUBSYS=ALL
# #export NVSHMEM_DEBUG_FILE=nvdebug_success
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
export NVSHMEM_LIBFABRIC_SUPPORT=1
export LIBFABRIC_HOME=/opt/cray/libfabric/1.20.1
export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH
export NVSHMEM_DISABLE_CUDA_VMM=1
export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
#export NVSHMEM_DEBUG_FILE=nvdebug_success

if [[ $NERSC_HOST == edison ]]; then
CORES_PER_NODE=24
@@ -70,8 +70,8 @@ else
exit $EXIT_HOST
fi

nprows=(1)
npcols=(1)
nprows=(2)
npcols=(2)
npz=(1)
nrhs=(1)
NTH=1
@@ -130,15 +130,15 @@ export MPICH_MAX_THREAD_SAFETY=multiple
# for MAT in rma10.mtx
# for MAT in raefsky3.mtx
# for MAT in s2D9pt2048.rua raefsky3.mtx rma10.mtx
# for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin
for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin
# for MAT in s2D9pt2048.rua
# for MAT in nlpkkt80.bin dielFilterV3real.bin Ga19As19H42.bin
# for MAT in dielFilterV3real.bin
# for MAT in s2D9pt1536.rua
# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin
# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
# for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx
for MAT in temp_13k.mtx
# for MAT in temp_13k.mtx
do
mkdir -p $MAT
for ii in `seq 1 $NREP`
@@ -151,16 +151,17 @@ do
# # export SUPERLU_ACC_SOLVE=1
# srun -N 4 -n $NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}_nmpipergpu${nmpipergpu}

SUPERLU_ACC_OFFLOAD=1
export GPU3DVERSION=0
export SUPERLU_ACC_SOLVE=0
echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}"
srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores compute-sanitizer --tool=memcheck --leak-check full ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu}

# SUPERLU_ACC_OFFLOAD=1
# export GPU3DVERSION=1
# export GPU3DVERSION=0
# export SUPERLU_ACC_SOLVE=0
# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}"
# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu}
# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores compute-sanitizer --tool=memcheck --leak-check full ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu}

SUPERLU_ACC_OFFLOAD=1
export GPU3DVERSION=1
export SUPERLU_ACC_SOLVE=1
echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}"
srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu}


# export SUPERLU_ACC_SOLVE=1
@@ -9,7 +9,6 @@ module load cudatoolkit/12.2
# module load cray-libsci/22.11.1.2
module load cray-libsci/23.12.5
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0
ulimit -s unlimited
#MPI settings:
export MPICH_GPU_SUPPORT_ENABLED=1
@@ -39,22 +38,23 @@ export SUPERLU_MPI_PROCESS_PER_GPU=$nmpipergpu # 2: this can better saturate GPU
export SUPERLU_RANKORDER='XY' # Be careful: XY needs to be used when NOROWPERM or SolveOnly is used for 3D grids

# ##NVSHMEM settings:
# # NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# export NVSHMEM_USE_GDRCOPY=1
# export NVSHMEM_MPI_SUPPORT=1
# export MPI_HOME=${MPICH_DIR}
# export NVSHMEM_LIBFABRIC_SUPPORT=1
# export LIBFABRIC_HOME=/opt/cray/libfabric/1.20.1
# export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH
# export NVSHMEM_DISABLE_CUDA_VMM=1
# export FI_CXI_OPTIMIZED_MRS=false
# export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
# export NVSHMEM_BOOTSTRAP=MPI
# export NVSHMEM_REMOTE_TRANSPORT=libfabric

# #export NVSHMEM_DEBUG=TRACE
# #export NVSHMEM_DEBUG_SUBSYS=ALL
# #export NVSHMEM_DEBUG_FILE=nvdebug_success
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
export NVSHMEM_LIBFABRIC_SUPPORT=1
export LIBFABRIC_HOME=/opt/cray/libfabric/1.20.1
export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH
export NVSHMEM_DISABLE_CUDA_VMM=1
export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
#export NVSHMEM_DEBUG_FILE=nvdebug_success

if [[ $NERSC_HOST == edison ]]; then
CORES_PER_NODE=24
@@ -7,7 +7,7 @@ module load cmake
module load cudatoolkit/12.2
module unload cray-libsci
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

ulimit -s unlimited
#MPI settings:
export MPICH_GPU_SUPPORT_ENABLED=1
@@ -35,8 +35,9 @@ export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU
export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU

##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -7,7 +7,6 @@ module load cmake
module load cudatoolkit/12.2
module unload cray-libsci
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

#MPI settings:
export MPICH_GPU_SUPPORT_ENABLED=1
@@ -39,8 +38,9 @@ export SUPERLU_NUM_GPU_STREAMS=1
export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU

##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -16,7 +16,6 @@ module load cudatoolkit
module load cray-libsci
module load cmake
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

# ulimit -s unlimited
#MPI settings:
@@ -44,8 +43,9 @@ export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU


##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -57,7 +57,6 @@ export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric
# export MPICH_OFI_NIC_POLICY=GPU

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
@@ -83,12 +82,12 @@ fi
# npz=(64 32 16)
# nrhs=(1 50)

nprows=(1 )
npcols=(1 )
nprows=(2 )
npcols=(2 )
npz=(1 )
nrhs=(1)

NTH=4
NTH=1
NREP=1
# NODE_VAL_TOT=1

@@ -141,27 +140,26 @@ export MPICH_MAX_THREAD_SAFETY=multiple
# for MAT in dielFilterV3real.bin
# for MAT in Geo_1438.bin s2D9pt2048.rua raefsky3.mtx rma10.mtx
# for MAT in Geo_1438.bin
# for MAT in s1_mat_0_126936.bin
for MAT in s1_mat_0_126936.bin
# for MAT in marcus_100000.dat
# for MAT in marcus_500000.dat
# for MAT in s2D9pt2048.rua
# for MAT in s2D9pt1536.rua
# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin
# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
# for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx
for MAT in temp_13k.mtx
# for MAT in temp_13k.mtx
do
mkdir -p $MAT
for ii in `seq 1 $NREP`
do
export SUPERLU_ACC_SOLVE=0


# # # srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}
# # export SUPERLU_ACC_OFFLOAD=0
# srun -n $NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}

unset SUPERLU_ACC_SOLVE
export SUPERLU_ACC_SOLVE=1
echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}"
srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}
# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores compute-sanitizer --tool=memcheck --leak-check full --report-api-errors no ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}
@@ -16,7 +16,6 @@ module load cudatoolkit
module load cray-libsci
module load cmake
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

# ulimit -s unlimited
#MPI settings:
@@ -44,8 +43,9 @@ export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU


##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -57,7 +57,6 @@ export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric
# export MPICH_OFI_NIC_POLICY=GPU

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
@@ -16,7 +16,6 @@ module load cudatoolkit
module load cray-libsci
module load cmake
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

# ulimit -s unlimited
#MPI settings:
@@ -44,8 +43,9 @@ export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU


##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -57,7 +57,6 @@ export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric
# export MPICH_OFI_NIC_POLICY=GPU

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
@@ -12,7 +12,6 @@ module load cudatoolkit
module load cray-libsci
module load cmake
# module use /global/common/software/nersc/pe/modulefiles/latest
module load nvshmem/2.11.0

#OpenMP settings:
export OMP_NUM_THREADS=1
@@ -33,8 +32,9 @@ export SUPERLU_BIND_MPI_GPU=1
export SUPERLU_ACC_OFFLOAD=0 # this can be 0 to do CPU tests on GPU nodes
export SUPERLU_ACC_SOLVE=1

##NVSHMEM settings:
# NVSHMEM_HOME=/global/cfs/cdirs/m2957/liuyangz/my_software/nvshmem_perlmutter/nvshmem_src_2.8.0-3/build/
# ##NVSHMEM settings:
# module load nvshmem/2.11.0
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
export NVSHMEM_USE_GDRCOPY=1
export NVSHMEM_MPI_SUPPORT=1
export MPI_HOME=${MPICH_DIR}
@@ -45,10 +45,13 @@ export NVSHMEM_DISABLE_CUDA_VMM=1
export FI_CXI_OPTIMIZED_MRS=false
export NVSHMEM_BOOTSTRAP_TWO_STAGE=1
export NVSHMEM_BOOTSTRAP=MPI
export NVSHMEM_REMOTE_TRANSPORT=libfabric

#export NVSHMEM_DEBUG=TRACE
#export NVSHMEM_DEBUG_SUBSYS=ALL
#export NVSHMEM_DEBUG_FILE=nvdebug_success


#run the application
#matrix=(nimrodMatrix-B.mtx nimrodMatrix-N.mtx)
INPUT_DIR=$CFS/m2957/liuyangz/my_research/matrix/