From fd71062bd8bed168173af0054292b86160c69827 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 20 Sep 2021 20:50:49 -0700 Subject: [PATCH] Updated zScatter_B3d() to support uneven block row partition of {A,B} in 3D interface. Only complex16 is implemented. --- CMakeLists.txt | 8 +- EXAMPLE/Makefile | 2 +- EXAMPLE/pddrive1.c | 7 +- EXAMPLE/pddrive1_ABglobal.c | 6 ++ EXAMPLE/pddrive2_ABglobal.c | 7 +- EXAMPLE/pddrive3_ABglobal.c | 7 +- EXAMPLE/pddrive4.c | 7 +- EXAMPLE/pddrive4_ABglobal.c | 6 ++ EXAMPLE/pddrive_ABglobal.c | 7 +- EXAMPLE/pddrive_spawn.c | 7 +- EXAMPLE/pzdrive.c | 7 +- EXAMPLE/pzdrive1.c | 7 +- EXAMPLE/pzdrive1_ABglobal.c | 7 +- EXAMPLE/pzdrive2_ABglobal.c | 7 +- EXAMPLE/pzdrive3_ABglobal.c | 7 +- EXAMPLE/pzdrive3d.c | 1 + EXAMPLE/pzdrive4.c | 7 +- EXAMPLE/pzdrive4_ABglobal.c | 6 ++ EXAMPLE/pzdrive_ABglobal.c | 7 +- EXAMPLE/pzdrive_spawn.c | 7 +- EXAMPLE/zcreate_matrix3d.c | 51 +++++++--- SRC/CMakeLists.txt | 6 ++ SRC/dlustruct_gpu.h | 8 +- SRC/dnrformat_loc3d.c | 2 + SRC/dsuperlu_gpu.cu | 4 +- SRC/pdgstrf3d.c | 16 +++- SRC/psgstrf3d.c | 9 +- SRC/pzgstrf3d.c | 16 +++- SRC/pzutil.c | 6 +- SRC/slustruct_gpu.h | 8 +- SRC/ssuperlu_gpu.cu | 4 +- SRC/supermatrix.h | 9 ++ SRC/zlustruct_gpu.h | 8 +- SRC/znrformat_loc3d.c | 179 ++++++++++++++++++++++++++++++++++-- SRC/zsuperlu_gpu.cu | 4 +- run_cmake_build.sh | 3 + 36 files changed, 386 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f13a12a..2d52b6fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) option(enable_single "Enable single precision library" OFF) -option(enable_complex16 "Enable complex16 precision library" OFF) +option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) option(XSDK_ENABLE_Fortran "Enable Fortran" ON) @@ -224,9 +224,9 @@ if (enable_openmp) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") -# On edison, OpenMP_EXE_LINKER_FLAGS is empty -# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# The following causes problem with cmake/3.20.+ +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'") message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") endif() diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 37387b52..ca5620b3 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -53,7 +53,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o #znrformat_loc3d.o ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index c2ccd067..72c88a6e 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -73,7 +73,12 @@ int 
main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c index 7f06b70e..7686b79c 100644 --- a/EXAMPLE/pddrive1_ABglobal.c +++ b/EXAMPLE/pddrive1_ABglobal.c @@ -72,6 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c index 57ebadf3..e908a6ca 100644 --- a/EXAMPLE/pddrive2_ABglobal.c +++ b/EXAMPLE/pddrive2_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c index 2e2a7433..e20c664d 100644 --- a/EXAMPLE/pddrive3_ABglobal.c +++ b/EXAMPLE/pddrive3_ABglobal.c @@ -78,7 +78,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index ca984a81..dbe9ee08 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index 9ff46dd8..2cf76078 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -71,6 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c index a47388b5..3541ab92 100644 --- a/EXAMPLE/pddrive_ABglobal.c +++ b/EXAMPLE/pddrive_ABglobal.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 131ea29c..b119b46e 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 3878558d..342b91a5 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) VT_traceoff(); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index b65733b2..69aea1d0 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c index 4437e4a8..bf47169f 100644 --- a/EXAMPLE/pzdrive1_ABglobal.c +++ b/EXAMPLE/pzdrive1_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c index 9959465b..96866c35 100644 --- a/EXAMPLE/pzdrive2_ABglobal.c +++ b/EXAMPLE/pzdrive2_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c index c83cf1a3..144e3f6e 100644 --- a/EXAMPLE/pzdrive3_ABglobal.c +++ b/EXAMPLE/pzdrive3_ABglobal.c @@ -77,7 +77,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 8bbed679..3e7f1463 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -183,6 +183,7 @@ main (int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + // grid.rankorder = 1; if(grid.iam==0) { MPI_Query_thread(&omp_mpi_level); diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 7453d2ac..33451140 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 5515e885..9b3ff81b 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -70,6 +70,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c index c3d798c1..b6f48554 100644 --- a/EXAMPLE/pzdrive_ABglobal.c +++ b/EXAMPLE/pzdrive_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index 30a28dd1..8dab3751 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c index b3c43ffd..5f0f7b6f 100644 --- a/EXAMPLE/zcreate_matrix3d.c +++ b/EXAMPLE/zcreate_matrix3d.c @@ -341,17 +341,44 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, nzval[0] = 0.1; #endif - /* Compute the number of rows to be distributed to local process */ - m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); - m_loc_fst = m_loc; - /* When m / procs is not an integer */ - if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) - { - /*m_loc = m_loc+1; - m_loc_fst = m_loc;*/ - if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ - m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); - } +// /* Compute the number of rows to be distributed to local process */ +// m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); +// m_loc_fst = m_loc; +// /* When m / procs is not an integer */ +// if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) +// { +// /*m_loc = m_loc+1; +// m_loc_fst = m_loc;*/ +// if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ +// m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); +// } + + switch(iam) { + case 0: + m_loc=111; fst_row=0; + break; + case 1: + m_loc=84; fst_row=111; + break; + case 2: + m_loc=108; fst_row=195; + break; + case 3: + m_loc=84; fst_row=303; + break; + case 4: + m_loc=108; fst_row=387; + break; + case 5: + m_loc=84; fst_row=495; + break; + case 6: + m_loc=108; fst_row=579; + break; + case 7: + m_loc=84; fst_row=687; + break; + } /* Create compressed column matrix for GA. 
*/ zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, @@ -379,7 +406,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; /* Set up row pointers */ rowptr[0] = 0; - fst_row = iam * m_loc_fst; +// fst_row = iam * m_loc_fst; nnz_loc = 0; for (j = 0; j < m_loc; ++j) { diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 51f10a42..ef071485 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -294,6 +294,12 @@ if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) endif() +# This is recommended by modern cmake: +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html +if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist OpenMP::OpenMP_C) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index 23187e66..1cdc366d 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_ int_t *LrowindPtr; /* A single vector */ double *LnzvalVec; /* A single vector */ - int_t *LnzvalPtr; /* A single vector */ - int_t *LnzvalPtr_host; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ int_t *UrowindVec; /* A single vector */ int_t *UrowindPtr; /* A single vector */ @@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_ int_t *UnzvalPtr_host; double *UnzvalVec; /* A single vector */ - int_t *UnzvalPtr; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ local_l_blk_info_t *local_l_blk_infoVec; int_t *local_l_blk_infoPtr; @@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_ int_t *xsup; gridinfo_t *grid; - double ScatterMOPCounter; double ScatterMOPTimer; double GemmFLOPCounter; diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 625df8ea..ae1f1a86 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -80,6 +80,8 @@ void dGatherNRformat_loc3d nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + + /* Gathered to layer 0. 
Other procs do not have these counts */ MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts, 1, mpi_int_t, 0, grid3d->zscp.comm); MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts, diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index 93c72c18..832879d2 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -766,6 +766,7 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->LnzvalVec)); checkCuda(cudaFree(A_gpu->LnzvalPtr)); free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ int_t streamId = 0; checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); @@ -798,8 +799,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->grid)); - - checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); @@ -814,7 +813,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); - checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); checkCuda(cudaFree(A_gpu->jib_lookupVec)); diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index dbef9da9..0deebd74 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -15,8 +15,9 @@ at the top-level directory. * *
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -225,14 +226,14 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    dLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // dLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -339,8 +340,13 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    dfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index f071d593..2c73580d 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -233,7 +233,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    sLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // sLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -340,8 +340,13 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    sfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 88eafa21..03e60b2e 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -224,14 +225,14 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    zLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // zLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -338,8 +339,13 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    zfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
index c784c497..cd5c69e4 100644
--- a/SRC/pzutil.c
+++ b/SRC/pzutil.c
@@ -810,6 +810,10 @@ void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
     SUPERLU_FREE(A3d->nnz_disp);
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->procs_to_send_list);
+    SUPERLU_FREE(A3d->send_count_list);
+    SUPERLU_FREE(A3d->procs_recv_from_list);
+    SUPERLU_FREE(A3d->recv_count_list);
     SUPERLU_FREE( A2d );         // free 2D structure
     SUPERLU_FREE( A3d );         // free 3D structure
 } /* zDestroy_A3d_gathered_on_2d */
@@ -889,5 +893,3 @@ zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
     CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
 #endif
 }
-
-
diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h
index 48038dbf..9475fba8 100644
--- a/SRC/slustruct_gpu.h
+++ b/SRC/slustruct_gpu.h
@@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     float  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     float  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index adb67693..78b70b83 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -766,6 +766,7 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -798,8 +799,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -814,7 +813,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 9913aa6b..1d720355 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -205,6 +205,15 @@ typedef struct NRformat_loc3d
     int *nnz_disp;
     int *b_counts_int;
     int *b_disp;
+
+    /* The following 4 structures are used for scattering
+       solution X from 2D grid-0 back to 3D processes */
+    int num_procs_to_send;  
+    int *procs_to_send_list;
+    int *send_count_list;
+    int num_procs_to_recv;
+    int *procs_recv_from_list;
+    int *recv_count_list;
 } NRformat_loc3d;
 
 
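The six fields above carry a point-to-point scatter schedule for sending X from 2D grid-0 back to the 3D processes: zGatherNRformat_loc3d() marks the schedule as not yet built (num_procs_to_send = EMPTY), zScatter_B3d() fills it on the first solve and reuses it on later solves, and zDestroy_A3d_gathered_on_2d() frees the four lists. A minimal, self-contained sketch of that build-once/reuse pattern follows; the struct and the placeholder build step are stand-ins for illustration only (assuming EMPTY == -1, as in SuperLU_DIST), not the real NRformat_loc3d API.

    #include <stdio.h>
    #include <stdlib.h>

    #define EMPTY (-1)

    /* stand-in for the schedule fields added to NRformat_loc3d */
    typedef struct {
        int  num_procs_to_send;
        int *procs_to_send_list;
        int *send_count_list;
    } scatter_schedule_t;

    static void scatter(scatter_schedule_t *s)
    {
        if (s->num_procs_to_send == EMPTY) {
            /* first solve: build and cache the schedule (placeholder values) */
            s->num_procs_to_send  = 1;
            s->procs_to_send_list = malloc(sizeof(int));
            s->send_count_list    = malloc(sizeof(int));
            s->procs_to_send_list[0] = 0;
            s->send_count_list[0]    = 42;
            printf("schedule built\n");
        } else {
            printf("schedule reused\n");   /* later solves skip the setup */
        }
        /* ... post the Irecv's / Sends using the cached lists ... */
    }

    int main(void)
    {
        scatter_schedule_t s = { EMPTY, NULL, NULL };
        scatter(&s);   /* prints "schedule built"  */
        scatter(&s);   /* prints "schedule reused" */
        free(s.procs_to_send_list);
        free(s.send_count_list);
        return 0;
    }
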
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
index 1064d424..39288000 100644
--- a/SRC/zlustruct_gpu.h
+++ b/SRC/zlustruct_gpu.h
@@ -77,8 +77,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     doublecomplex  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -86,7 +86,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     doublecomplex  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -108,7 +109,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 89de8dd0..f93cb215 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -22,6 +22,7 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
+#include <stdio.h>
 
 /* Dst <- BlockByBlock (Src), reshape the block storage. */
 static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
@@ -66,6 +67,7 @@ void zGatherNRformat_loc3d
     if ( Fact == DOFACT ) { /* Factorize from scratch */
 	/* A3d is output. Compute counts from scratch */
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
 	// find number of nnzs
@@ -109,6 +111,7 @@ void zGatherNRformat_loc3d
 		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
 		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
 		A2d->rowptr[0] = 0;
+		printf(" Gather layer-0: iam %d\n", grid3d->iam); fflush(stdout);
 	    }
 
 	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
@@ -297,7 +300,8 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
     int *row_disp       = A3d->row_disp;
-    int i, p;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
     gridinfo_t *grid2d = &(grid3d->grid2d);
@@ -333,14 +337,14 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		     Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		     0, grid3d->zscp.comm);
 
-    } else { /* Z-major in 3D grid */
+    } else { /* Z-major in 3D grid (default) */
         /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
 	                       0      3      6      9
  	                       1      4      7      10      
 	                       2      5      8      11
 	  GATHER:  {A, B} in A * X = B
 	  layer-0:
-    	       B (row space)  X (column space)  SCATTER
+	  B (row space)  X (column space)  SCATTER
 	       ----           ----        ---->>
            P0  0              0
 (equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
@@ -358,15 +362,172 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       11            11
 	       ----         ----
 	*/
-        MPI_Request recv_req;
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
 	int npdep = grid3d->npdep, dest, src, tag;
-	int nprocs = pxy * npdep;
-
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] records the prefix sum of the block rows of the RHS
+	       along the Z-dimension of the process grid.
+	       row_disp[npdep] is the total number of X entries on my proc.
+	       (equals A2d->m_loc.)
+	       A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       among all the processes.
+	       A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 sets up its send info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    //printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+		    //src, iam, x_recv_counts[p], tag);
+		    //fflush(stdout);
+		    ++j;
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, SuperLU_MPI_DOUBLE_COMPLEX,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+	    
+	///////////	
+#if 0 // The following code works only with an even block row partition of the RHS
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
+	
 	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		 src, tag, grid3d->comm, &recv_req);
 
@@ -381,10 +542,12 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
 	    }
 	}  /* end layer 0 send */
-    
+	
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
-
+#endif
+	///////////	
+	
     } /* else Z-major */
 
     // B <- colMajor(Btmp)
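The heart of the new schedule computation above is an interval-splitting step: a layer-0 process that owns the contiguous rows [fst_row, fst_row + m_loc) of X in the 2D partition must send each 3D process the portion of that interval falling inside that process's own block of rows, and those blocks may now be uneven. The following self-contained sketch illustrates the idea with made-up block sizes; all names are local to the sketch rather than SuperLU_DIST symbols.

    #include <stdio.h>

    int main(void)
    {
        /* rows of X owned by each 3D process -- an uneven block row partition */
        int m_loc_3d[4] = { 3, 1, 4, 2 };   /* blocks: [0,3) [3,4) [4,8) [8,10) */

        int fst_row = 2, end_row = 9;       /* rows held by this layer-0 process */

        int lo = 0;                         /* start of process p's block        */
        for (int p = 0; p < 4; ++p) {
            int hi  = lo + m_loc_3d[p];     /* one past the end of p's block     */
            int beg = fst_row > lo ? fst_row : lo;
            int fin = end_row < hi ? end_row : hi;
            if (fin > beg)                  /* nonempty overlap => p is a dest   */
                printf("send %d row(s) of X to process %d\n", fin - beg, p);
            lo = hi;
        }
        return 0;
    }

Run as-is, it reports 1 row for process 0, 1 for process 1, 4 for process 2, and 1 for process 3, which together cover the 7 rows in [2, 9).
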
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index a0746dc1..b5b6b361 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -775,6 +775,7 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -807,8 +808,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -823,7 +822,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index a5284dde..b318c0ce 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -57,7 +57,10 @@ then
     -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=0 -DDEBUGlevel=0" \
     -DCMAKE_C_COMPILER=mpicc \
     -DCMAKE_CXX_COMPILER=mpicxx \
+    -DCMAKE_CXX_FLAGS="-std=c++11" \
     -DCMAKE_Fortran_COMPILER=mpif90 \
+    -DCMAKE_LINKER=mpicxx \
+    -Denable_openmp=ON \
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \
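
For reference, the GPU device binding added to the EXAMPLE drivers amounts to the following round-robin pattern, shown here in isolation (a sketch only, with error checking omitted):

    #include <mpi.h>
    #include <cuda_runtime.h>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);

        int rank, devs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cudaGetDeviceCount(&devs);
        cudaSetDevice(rank % devs);   /* bind each MPI rank to one of the node's GPUs */

        /* ... call the SuperLU_DIST driver routines ... */

        MPI_Finalize();
        return 0;
    }

Using the global rank modulo the device count matches what the drivers do; a node-local rank (for example, from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED) is a common alternative when ranks are not placed consecutively per node.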