From fd71062bd8bed168173af0054292b86160c69827 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 20 Sep 2021 20:50:49 -0700 Subject: [PATCH] Updated zScatter_B3d() to support uneven block row partition of {A,B} in 3D interface. Only complex16 is implemented. --- CMakeLists.txt | 8 +- EXAMPLE/Makefile | 2 +- EXAMPLE/pddrive1.c | 7 +- EXAMPLE/pddrive1_ABglobal.c | 6 ++ EXAMPLE/pddrive2_ABglobal.c | 7 +- EXAMPLE/pddrive3_ABglobal.c | 7 +- EXAMPLE/pddrive4.c | 7 +- EXAMPLE/pddrive4_ABglobal.c | 6 ++ EXAMPLE/pddrive_ABglobal.c | 7 +- EXAMPLE/pddrive_spawn.c | 7 +- EXAMPLE/pzdrive.c | 7 +- EXAMPLE/pzdrive1.c | 7 +- EXAMPLE/pzdrive1_ABglobal.c | 7 +- EXAMPLE/pzdrive2_ABglobal.c | 7 +- EXAMPLE/pzdrive3_ABglobal.c | 7 +- EXAMPLE/pzdrive3d.c | 1 + EXAMPLE/pzdrive4.c | 7 +- EXAMPLE/pzdrive4_ABglobal.c | 6 ++ EXAMPLE/pzdrive_ABglobal.c | 7 +- EXAMPLE/pzdrive_spawn.c | 7 +- EXAMPLE/zcreate_matrix3d.c | 51 +++++++--- SRC/CMakeLists.txt | 6 ++ SRC/dlustruct_gpu.h | 8 +- SRC/dnrformat_loc3d.c | 2 + SRC/dsuperlu_gpu.cu | 4 +- SRC/pdgstrf3d.c | 16 +++- SRC/psgstrf3d.c | 9 +- SRC/pzgstrf3d.c | 16 +++- SRC/pzutil.c | 6 +- SRC/slustruct_gpu.h | 8 +- SRC/ssuperlu_gpu.cu | 4 +- SRC/supermatrix.h | 9 ++ SRC/zlustruct_gpu.h | 8 +- SRC/znrformat_loc3d.c | 179 ++++++++++++++++++++++++++++++++++-- SRC/zsuperlu_gpu.cu | 4 +- run_cmake_build.sh | 3 + 36 files changed, 386 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f13a12a..2d52b6fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) option(enable_single "Enable single precision library" OFF) -option(enable_complex16 "Enable complex16 precision library" OFF) +option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) option(XSDK_ENABLE_Fortran "Enable Fortran" ON) @@ -224,9 +224,9 @@ if (enable_openmp) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") -# On edison, OpenMP_EXE_LINKER_FLAGS is empty -# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# The following causes problem with cmake/3.20.+ +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'") message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") endif() diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 37387b52..ca5620b3 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -53,7 +53,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o #znrformat_loc3d.o ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index c2ccd067..72c88a6e 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -73,7 +73,12 @@ int 
main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c index 7f06b70e..7686b79c 100644 --- a/EXAMPLE/pddrive1_ABglobal.c +++ b/EXAMPLE/pddrive1_ABglobal.c @@ -72,6 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c index 57ebadf3..e908a6ca 100644 --- a/EXAMPLE/pddrive2_ABglobal.c +++ b/EXAMPLE/pddrive2_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c index 2e2a7433..e20c664d 100644 --- a/EXAMPLE/pddrive3_ABglobal.c +++ b/EXAMPLE/pddrive3_ABglobal.c @@ -78,7 +78,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index ca984a81..dbe9ee08 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index 9ff46dd8..2cf76078 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -71,6 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c index a47388b5..3541ab92 100644 --- a/EXAMPLE/pddrive_ABglobal.c +++ b/EXAMPLE/pddrive_ABglobal.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 131ea29c..b119b46e 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 3878558d..342b91a5 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) VT_traceoff(); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index b65733b2..69aea1d0 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c index 4437e4a8..bf47169f 100644 --- a/EXAMPLE/pzdrive1_ABglobal.c +++ b/EXAMPLE/pzdrive1_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c index 9959465b..96866c35 100644 --- a/EXAMPLE/pzdrive2_ABglobal.c +++ b/EXAMPLE/pzdrive2_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c index c83cf1a3..144e3f6e 100644 --- a/EXAMPLE/pzdrive3_ABglobal.c +++ b/EXAMPLE/pzdrive3_ABglobal.c @@ -77,7 +77,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 8bbed679..3e7f1463 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -183,6 +183,7 @@ main (int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + // grid.rankorder = 1; if(grid.iam==0) { MPI_Query_thread(&omp_mpi_level); diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 7453d2ac..33451140 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 5515e885..9b3ff81b 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -70,6 +70,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c index c3d798c1..b6f48554 100644 --- a/EXAMPLE/pzdrive_ABglobal.c +++ b/EXAMPLE/pzdrive_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index 30a28dd1..8dab3751 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c index b3c43ffd..5f0f7b6f 100644 --- a/EXAMPLE/zcreate_matrix3d.c +++ b/EXAMPLE/zcreate_matrix3d.c @@ -341,17 +341,44 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, nzval[0] = 0.1; #endif - /* Compute the number of rows to be distributed to local process */ - m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); - m_loc_fst = m_loc; - /* When m / procs is not an integer */ - if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) - { - /*m_loc = m_loc+1; - m_loc_fst = m_loc;*/ - if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ - m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); - } +// /* Compute the number of rows to be distributed to local process */ +// m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); +// m_loc_fst = m_loc; +// /* When m / procs is not an integer */ +// if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) +// { +// /*m_loc = m_loc+1; +// m_loc_fst = m_loc;*/ +// if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ +// m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); +// } + + switch(iam) { + case 0: + m_loc=111; fst_row=0; + break; + case 1: + m_loc=84; fst_row=111; + break; + case 2: + m_loc=108; fst_row=195; + break; + case 3: + m_loc=84; fst_row=303; + break; + case 4: + m_loc=108; fst_row=387; + break; + case 5: + m_loc=84; fst_row=495; + break; + case 6: + m_loc=108; fst_row=579; + break; + case 7: + m_loc=84; fst_row=687; + break; + } /* Create compressed column matrix for GA. 
*/ zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, @@ -379,7 +406,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; /* Set up row pointers */ rowptr[0] = 0; - fst_row = iam * m_loc_fst; +// fst_row = iam * m_loc_fst; nnz_loc = 0; for (j = 0; j < m_loc; ++j) { diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 51f10a42..ef071485 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -294,6 +294,12 @@ if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) endif() +# This is recommended by modern cmake: +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html +if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist OpenMP::OpenMP_C) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index 23187e66..1cdc366d 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_ int_t *LrowindPtr; /* A single vector */ double *LnzvalVec; /* A single vector */ - int_t *LnzvalPtr; /* A single vector */ - int_t *LnzvalPtr_host; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ int_t *UrowindVec; /* A single vector */ int_t *UrowindPtr; /* A single vector */ @@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_ int_t *UnzvalPtr_host; double *UnzvalVec; /* A single vector */ - int_t *UnzvalPtr; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ local_l_blk_info_t *local_l_blk_infoVec; int_t *local_l_blk_infoPtr; @@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_ int_t *xsup; gridinfo_t *grid; - double ScatterMOPCounter; double ScatterMOPTimer; double GemmFLOPCounter; diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 625df8ea..ae1f1a86 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -80,6 +80,8 @@ void dGatherNRformat_loc3d nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + + /* Gathered to layer 0. 
Other procs do not have these counts */ MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts, 1, mpi_int_t, 0, grid3d->zscp.comm); MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts, diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index 93c72c18..832879d2 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -766,6 +766,7 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->LnzvalVec)); checkCuda(cudaFree(A_gpu->LnzvalPtr)); free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ int_t streamId = 0; checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); @@ -798,8 +799,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->grid)); - - checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); @@ -814,7 +813,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); - checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); checkCuda(cudaFree(A_gpu->jib_lookupVec)); diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index dbef9da9..0deebd74 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -15,8 +15,9 @@ at the top-level directory. * *
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -225,14 +226,14 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    dLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // dLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -339,8 +340,13 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    dfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index f071d593..2c73580d 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -233,7 +233,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    sLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // sLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -340,8 +340,13 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    sfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 88eafa21..03e60b2e 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -224,14 +225,14 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    zLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // zLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -338,8 +339,13 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    zfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
index c784c497..cd5c69e4 100644
--- a/SRC/pzutil.c
+++ b/SRC/pzutil.c
@@ -810,6 +810,10 @@ void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
     SUPERLU_FREE(A3d->nnz_disp);
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->procs_to_send_list);
+    SUPERLU_FREE(A3d->send_count_list);
+    SUPERLU_FREE(A3d->procs_recv_from_list);
+    SUPERLU_FREE(A3d->recv_count_list);
     SUPERLU_FREE( A2d );         // free 2D structure
     SUPERLU_FREE( A3d );         // free 3D structure
 } /* zDestroy_A3d_gathered_on_2d */
@@ -889,5 +893,3 @@ zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
     CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
 #endif
 }
-
-
diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h
index 48038dbf..9475fba8 100644
--- a/SRC/slustruct_gpu.h
+++ b/SRC/slustruct_gpu.h
@@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     float  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     float  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index adb67693..78b70b83 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -766,6 +766,7 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -798,8 +799,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -814,7 +813,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 9913aa6b..1d720355 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -205,6 +205,15 @@ typedef struct NRformat_loc3d
     int *nnz_disp;
     int *b_counts_int;
     int *b_disp;
+
+    /* The following 4 structures are used for scattering
+       solution X from 2D grid-0 back to 3D processes */
+    int num_procs_to_send;  
+    int *procs_to_send_list;
+    int *send_count_list;
+    int num_procs_to_recv;
+    int *procs_recv_from_list;
+    int *recv_count_list;
 } NRformat_loc3d;
 
 
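The six fields above carry a point-to-point scatter schedule for sending X from 2D grid-0 back to the 3D processes: zGatherNRformat_loc3d() marks the schedule as not yet built (num_procs_to_send = EMPTY), zScatter_B3d() fills it on the first solve and reuses it on later solves, and zDestroy_A3d_gathered_on_2d() frees the four lists. A minimal, self-contained sketch of that build-once/reuse pattern follows; the struct and the placeholder build step are stand-ins for illustration only (assuming EMPTY == -1, as in SuperLU_DIST), not the real NRformat_loc3d API.

    #include <stdio.h>
    #include <stdlib.h>

    #define EMPTY (-1)

    /* stand-in for the schedule fields added to NRformat_loc3d */
    typedef struct {
        int  num_procs_to_send;
        int *procs_to_send_list;
        int *send_count_list;
    } scatter_schedule_t;

    static void scatter(scatter_schedule_t *s)
    {
        if (s->num_procs_to_send == EMPTY) {
            /* first solve: build and cache the schedule (placeholder values) */
            s->num_procs_to_send  = 1;
            s->procs_to_send_list = malloc(sizeof(int));
            s->send_count_list    = malloc(sizeof(int));
            s->procs_to_send_list[0] = 0;
            s->send_count_list[0]    = 42;
            printf("schedule built\n");
        } else {
            printf("schedule reused\n");   /* later solves skip the setup */
        }
        /* ... post the Irecv's / Sends using the cached lists ... */
    }

    int main(void)
    {
        scatter_schedule_t s = { EMPTY, NULL, NULL };
        scatter(&s);   /* prints "schedule built"  */
        scatter(&s);   /* prints "schedule reused" */
        free(s.procs_to_send_list);
        free(s.send_count_list);
        return 0;
    }
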
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
index 1064d424..39288000 100644
--- a/SRC/zlustruct_gpu.h
+++ b/SRC/zlustruct_gpu.h
@@ -77,8 +77,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     doublecomplex  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -86,7 +86,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     doublecomplex  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -108,7 +109,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 89de8dd0..f93cb215 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -22,6 +22,7 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
+#include <stdio.h>
 
 /* Dst <- BlockByBlock (Src), reshape the block storage. */
 static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
@@ -66,6 +67,7 @@ void zGatherNRformat_loc3d
     if ( Fact == DOFACT ) { /* Factorize from scratch */
 	/* A3d is output. Compute counts from scratch */
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
 	// find number of nnzs
@@ -109,6 +111,7 @@ void zGatherNRformat_loc3d
 		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
 		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
 		A2d->rowptr[0] = 0;
+		printf(" Gather layer-0: iam %d\n", grid3d->iam); fflush(stdout);
 	    }
 
 	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
@@ -297,7 +300,8 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
     int *row_disp       = A3d->row_disp;
-    int i, p;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
     gridinfo_t *grid2d = &(grid3d->grid2d);
@@ -333,14 +337,14 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		     Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		     0, grid3d->zscp.comm);
 
-    } else { /* Z-major in 3D grid */
+    } else { /* Z-major in 3D grid (default) */
         /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
 	                       0      3      6      9
  	                       1      4      7      10      
 	                       2      5      8      11
 	  GATHER:  {A, B} in A * X = B
 	  layer-0:
-    	       B (row space)  X (column space)  SCATTER
+	  B (row space)  X (column space)  SCATTER
 	       ----           ----        ---->>
            P0  0              0
 (equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
@@ -358,15 +362,172 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       11            11
 	       ----         ----
 	*/
-        MPI_Request recv_req;
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
 	int npdep = grid3d->npdep, dest, src, tag;
-	int nprocs = pxy * npdep;
-
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] records the prefix sum of the block rows of the RHS
+	       along the Z-dimension of the process grid.
+	       row_disp[npdep] is the total number of X entries on my proc.
+	       (equals A2d->m_loc.)
+	       A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       among all the processes.
+	       A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 sets up its send info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    //printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+		    //src, iam, x_recv_counts[p], tag);
+		    //fflush(stdout);
+		    ++j;
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, SuperLU_MPI_DOUBLE_COMPLEX,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+	    
+	///////////	
+#if 0 // The following code works only with an even block row partition of the RHS
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
+	
 	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		 src, tag, grid3d->comm, &recv_req);
 
@@ -381,10 +542,12 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
 	    }
 	}  /* end layer 0 send */
-    
+	
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
-
+#endif
+	///////////	
+	
     } /* else Z-major */
 
     // B <- colMajor(Btmp)
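The heart of the new schedule computation above is an interval-splitting step: a layer-0 process that owns the contiguous rows [fst_row, fst_row + m_loc) of X in the 2D partition must send each 3D process the portion of that interval falling inside that process's own block of rows, and those blocks may now be uneven. The following self-contained sketch illustrates the idea with made-up block sizes; all names are local to the sketch rather than SuperLU_DIST symbols.

    #include <stdio.h>

    int main(void)
    {
        /* rows of X owned by each 3D process -- an uneven block row partition */
        int m_loc_3d[4] = { 3, 1, 4, 2 };   /* blocks: [0,3) [3,4) [4,8) [8,10) */

        int fst_row = 2, end_row = 9;       /* rows held by this layer-0 process */

        int lo = 0;                         /* start of process p's block        */
        for (int p = 0; p < 4; ++p) {
            int hi  = lo + m_loc_3d[p];     /* one past the end of p's block     */
            int beg = fst_row > lo ? fst_row : lo;
            int fin = end_row < hi ? end_row : hi;
            if (fin > beg)                  /* nonempty overlap => p is a dest   */
                printf("send %d row(s) of X to process %d\n", fin - beg, p);
            lo = hi;
        }
        return 0;
    }

Run as-is, it reports 1 row for process 0, 1 for process 1, 4 for process 2, and 1 for process 3, which together cover the 7 rows in [2, 9).
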
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index a0746dc1..b5b6b361 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -775,6 +775,7 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -807,8 +808,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -823,7 +822,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index a5284dde..b318c0ce 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -57,7 +57,10 @@ then
     -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=0 -DDEBUGlevel=0" \
     -DCMAKE_C_COMPILER=mpicc \
     -DCMAKE_CXX_COMPILER=mpicxx \
+    -DCMAKE_CXX_FLAGS="-std=c++11" \
     -DCMAKE_Fortran_COMPILER=mpif90 \
+    -DCMAKE_LINKER=mpicxx \
+    -Denable_openmp=ON \
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \
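
For reference, the GPU device binding added to the EXAMPLE drivers amounts to the following round-robin pattern, shown here in isolation (a sketch only, with error checking omitted):

    #include <mpi.h>
    #include <cuda_runtime.h>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);

        int rank, devs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cudaGetDeviceCount(&devs);
        cudaSetDevice(rank % devs);   /* bind each MPI rank to one of the node's GPUs */

        /* ... call the SuperLU_DIST driver routines ... */

        MPI_Finalize();
        return 0;
    }

Using the global rank modulo the device count matches what the drivers do; a node-local rank (for example, from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED) is a common alternative when ranks are not placed consecutively per node.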