diff --git a/.ci_tests.sh b/.ci_tests.sh new file mode 100755 index 00000000..e9664161 --- /dev/null +++ b/.ci_tests.sh @@ -0,0 +1,31 @@ +#!/bin/sh +set -e + +export RED="\033[31;1m" +export BLUE="\033[34;1m" +export ROOT_DIR="$PWD" +printf "${BLUE} SLU; Entered tests file:\n" + + +export DATA_FOLDER=$ROOT_DIR/EXAMPLE +export EXAMPLE_FOLDER=$ROOT_DIR/build/EXAMPLE +export TEST_FOLDER=$ROOT_DIR/build/TEST + +case "${TEST_NUMBER}" in +1) mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +2) mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +3) mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +4) mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +5) mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +6) mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +7) mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +8) mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +9) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +10) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +11) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +12) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive1" "-r" "2" "-c" "2" 
"$DATA_FOLDER/cg20.cua" ;; +13) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;; +14) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;; +15) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive_ABglobal" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +*) printf "${RED} ###SLU: Unknown test\n" ;; +esac diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..391f4adc --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,97 @@ +name: Run Github CI tests. + +on: [push, pull_request] + +jobs: + test: + name: Build and test + runs-on: ubuntu-latest + + strategy: + matrix: + compiler: + - gcc + test: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Install dependencies + run: | + export BLUE="\033[34;1m" + mkdir -p installDir + + printf "${BLUE} SLU; Installing gcc-9 via apt\n" + sudo apt-get update + sudo apt-get install build-essential software-properties-common -y + sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y + sudo apt-get update + sudo apt-get install gcc-9 g++-9 -y + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9 + export CXX="g++-9" + export CC="gcc-9" + printf "${BLUE} SLU; Done installing gcc-9 via apt\n" + + printf "${BLUE} SLU; Installing gfortran via apt\n" + sudo apt-get install gfortran-9 -y + sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-9 60 + printf "${BLUE} SLU; Done installing gfortran via apt\n" + + printf "${BLUE} SLU; Installing openmpi\n" + sudo apt-get install openmpi-bin libopenmpi-dev + printf "${BLUE} SLU; Done installing openmpi\n" + + printf "${BLUE} SLU; Installing BLASfrom apt\n" + sudo apt-get install libblas-dev + export BLAS_LIB=/usr/lib/libblas/libblas.so + printf "${BLUE} SLU; Done installing BLASfrom 
apt\n" + + printf "${BLUE} SLU; Installing LAPACKfrom apt\n" + sudo apt-get install liblapack-dev + export LAPACK_LIB=/usr/lib/liblapack.so + printf "${BLUE} SLU; Done installing LAPACKfrom apt\n" + + printf "${BLUE} SLU; Installing ParMetis-4.0 from source\n" + cd $GITHUB_WORKSPACE/installDir + wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/parmetis/parmetis-4.0.3.tar.gz + tar -xf parmetis-4.0.3.tar.gz + cd parmetis-4.0.3/ + mkdir -p install + make config shared=1 cc=mpicc cxx=mpic++ prefix=$PWD/install + make install > make_parmetis_install.log 2>&1 + printf "${BLUE} SLU; Done installing ParMetis-4.0 from source\n" + + - name: Install package + run: | + export BLUE="\033[34;1m" + printf "${BLUE} SLU; Installing superlu_dist from source\n" + cd $GITHUB_WORKSPACE + rm -rf build + mkdir -p build + cd build + cmake .. \ + -DTPL_PARMETIS_INCLUDE_DIRS="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/metis/include;$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/include" \ + -DTPL_PARMETIS_LIBRARIES="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/lib/libparmetis.so" \ + -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=1" \ + -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \ + -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \ + -DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \ + -Denable_blaslib=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DCMAKE_INSTALL_PREFIX=. \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + make + make install + printf "${BLUE} SLU; Done installing superlu_dist from source\n" + + - name: Test + run: | + cd $GITHUB_WORKSPACE + export TEST_NUMBER=${{ matrix.test }} + ./.ci_tests.sh + + diff --git a/.gitignore b/.gitignore index cd5d67bb..39be29c9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,11 @@ # You have to ignore this generated file or git will complain that it is an # unknown file! 
-/make.inc +make.inc # If the instructions are telling people to create this build dir under the # source tree, you had better put in an ignore for this. -#/build/* +build/* + +# not to commit any changes to the following file +FORTRAN/superlu_dist_config.fh diff --git a/.travis.yml b/.travis.yml index af98f264..3e758231 100644 --- a/.travis.yml +++ b/.travis.yml @@ -111,4 +111,4 @@ install: script: - cd $TRAVIS_BUILD_DIR - - ./.travis_tests.sh + - ./.ci_tests.sh diff --git a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt index 3d259fef..b0be69b7 100644 --- a/CBLAS/CMakeLists.txt +++ b/CBLAS/CMakeLists.txt @@ -46,5 +46,5 @@ endif() add_library(blas ${sources} ${HEADERS}) -install(TARGETS blas DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +install(TARGETS blas DESTINATION "${INSTALL_LIB_DIR}") +install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04dda0b6..6de4c56d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,14 +5,14 @@ ###################################################################### # Required version -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18.1 FATAL_ERROR) # Project version numbers #project(SuperLU_DIST C CXX CUDA) project(SuperLU_DIST C CXX) -set(VERSION_MAJOR "6") -set(VERSION_MINOR "4") -set(VERSION_BugFix "0") +set(VERSION_MAJOR "7") +set(VERSION_MINOR "1") +set(VERSION_BugFix "1") set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix}) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") @@ -20,19 +20,27 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Set up options option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) +option(enable_single "Enable single precision library" OFF) option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build 
tests" ON) option(enable_examples "Build examples" ON) +option(XSDK_ENABLE_Fortran "Enable Fortran" ON) + +#-- BLAS option(TPL_ENABLE_INTERNAL_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) option(TPL_BLAS_LIBRARIES "List of absolute paths to blas libraries [].") -option(TPL_ENABLE_LAPACKLIB "Enable lapack library" ON) -option(TPL_LAPACK_LIBRARIES "List of absolute paths to lapack libraries [].") +#-- ParMETIS option(TPL_ENABLE_PARMETISLIB "Build the ParMETIS library" ON) option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].") option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].") -option(TPL_ENABLE_COMBBLASLIB "BUILD THE COMBBLAS LIBRARY" OFF) -OPTION(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].") +#-- LAPACK +option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" OFF) +option(TPL_LAPACK_LIBRARIES "List of absolute paths to LAPACK libraries [].") +#-- CombBLAS +option(TPL_ENABLE_COMBBLASLIB "Build the CombBLAS library" OFF) +option(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].") option(TPL_COMBBLAS_INCLUDE_DIRS "List of absolute paths to CombBLAS include directories [].") +#-- CUDA option(TPL_ENABLE_CUDALIB "Enable the CUDA libraries" OFF) option(TPL_ENABLE_HIPLIB "Enable the HIP libraries" OFF) @@ -40,17 +48,30 @@ option(TPL_ENABLE_HIPLIB "Enable the HIP libraries" OFF) # # IDEAS: xSDK standards module #MESSAGE("\nProcess XSDK defaults ...") -#SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired +# SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired #INCLUDE("cmake/XSDKDefaults.cmake") ###################################################################### -INCLUDE(CTest) +include(CTest) +include(CheckLanguage) ###################################################################### # # Usual initialization stuff # ###################################################################### 
+set(CMAKE_CXX_STANDARD 11) +#set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") + +if (XSDK_ENABLE_Fortran) + enable_language (Fortran) + set(NOFORTRAN FALSE) +endif() + +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ## ???? set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib") #---- For shared library @@ -80,25 +101,17 @@ if (BUILD_SHARED_LIBS) set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.so) SET(CMAKE_EXE_LINKER_FLAGS - "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") if (BUILD_STATIC_LIBS) message("-- SuperLU_DIST will also be built as a static library.") endif() + set(SHARED_C_FLAGS_EXPORT ${CMAKE_SHARED_LIBRARY_C_FLAGS}) else() message("-- SuperLU_DIST will be built as a static library.") set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.a) endif() -set(CMAKE_CXX_STANDARD 11) -#set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") - -if (XSDK_ENABLE_Fortran) - enable_language (Fortran) - set(NOFORTRAN FALSE) -endif() set(SUPERLU_VERSION "${PROJECT_VERSION}") set(SUPERLU_REV "${PROJECT_REV}") @@ -113,13 +126,12 @@ if (NOT CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX /usr/local) endif() - if(NOT MSVC) include(GNUInstallDirs) set(default_install_inc_dir ${CMAKE_INSTALL_INCLUDEDIR}) set(default_install_lib_dir ${CMAKE_INSTALL_LIBDIR}) set(default_install_bin_dir ${CMAKE_INSTALL_BINDIR}) -else() +else() # for Windows set(default_install_inc_dir "include") set(default_install_lib_dir "lib") set(default_install_bin_dir "bin") @@ -131,13 +143,15 @@ set(INSTALL_BIN_DIR "${default_install_bin_dir}" CACHE STRING "The folder where # Set up required compiler defines and options. 
## get_directory_property( DirDefs COMPILE_DEFINITIONS ) -# set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}") -# set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") if(XSDK_INDEX_SIZE EQUAL 64) message("-- Using 64 bit integer for index size.") endif() -set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -g" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") ###################################################################### # @@ -155,12 +169,53 @@ if(MPI_C_FOUND) endif() if (XSDK_ENABLE_Fortran) if(MPI_Fortran_FOUND) - add_definitions(${MPI_Fortran_COMPILE_FLAGS}) - include_directories(${MPI_Fortran_INCLUDE_PATH}) set(CMAKE_Fortran_FLAGS "${MPI_Fortran_COMPILE_FLAGS} ${CMAKE_Fortran_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_Fortran_LINK_FLAGS}") + include_directories(${MPI_Fortran_INCLUDE_PATH}) endif() endif() + +#---- CUDA libraries +if (TPL_ENABLE_CUDALIB) ## want to use cuda + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + message("-- Enabled support for CUDA.") + enable_language(CUDA) + find_package(CUDA REQUIRED) + if (CUDA_FOUND) + if (NOT CMAKE_CUDA_FLAGS) + cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) + endif() + set(HAVE_CUDA TRUE) + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG" CACHE STRING "") + set(CMAKE_CUDA_FLAGS_DDEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g" CACHE STRING "") + endif() + +# find_package(CUB REQUIRED) + + find_package(CUDAToolkit REQUIRED) + message("-- CUDAToolkit_LIBRARY_ROOT='${CUDAToolkit_LIBRARY_ROOT}'") + if (NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "") + set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") + set(CUDA_CUBLAS_LIBRARIES 
"${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") + else() + message("-- CUDAToolkit_LIBRARY_ROOT empty, not setting CUDA_LIBRARIES") + endif() + + +# # The following make.inc exporting does not work +# set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver) +# # fix up CUDA library names +# string (REPLACE ";" " " CUDA_LIB_STR "${CUDA_LIB}") +# set(CUDA_LIB_EXPORT ${CUDA_LIB_STR}) +# set(HAVE_CUDA TRUE) + # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_CUDA") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDA") + else() + message("-- CUDA libraries not found.") + endif() +endif() + #--------------------- OpenMP --------------------- if (NOT DEFINED enable_openmp) set(enable_openmp TRUE) @@ -171,9 +226,9 @@ if (enable_openmp) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") -# On edison, OpenMP_EXE_LINKER_FLAGS is empty -# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# The following causes problem with cmake/3.20.+ +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'") message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") endif() @@ -275,9 +330,9 @@ else() add_subdirectory(CBLAS) set(BLAS_LIB blas) if (BUILD_SHARED_LIBS) # export to be referenced by downstream makefile - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.so) else() - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.a) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.a) endif() endif() @@ -349,8 +404,6 @@ if(PARMETIS_FOUND) set(HAVE_PARMETIS TRUE) endif() - - #---------------------- Additional C linker 
library --------- SET(_c_libs ${CMAKE_C_IMPLICIT_LINK_LIBRARIES}) FOREACH(_lib ${_c_libs}) @@ -369,6 +422,16 @@ if (XSDK_ENABLE_Fortran) string (REPLACE ";" " " EXTRA_FLIB_STR "${EXTRA_FLIB}") set(EXTRA_FLIB_EXPORT ${EXTRA_FLIB_STR}) message("-- EXTRA_FLIB_EXPORT='${EXTRA_FLIB_EXPORT}'") + + if (BUILD_SHARED_LIBS) + message("-- superlu_dist_fortran will be built as a dynamic library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.so) + SET(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + else() + message("-- superlu_dist_fortran will be built as a static library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.a) + endif() endif() #--------------------- CombBLAS --------------------- @@ -389,6 +452,7 @@ if (TPL_ENABLE_COMBBLASLIB) ## want to use CombBLAS message("-- Enabled support for COMBBLAS") set(COMBBLAS_FOUND TRUE) + set(CMAKE_CXX_STANDARD 14) # CombBLAS requires c++14 set(COMBBLAS_LIB ${TPL_COMBBLAS_LIBRARIES}) # fix up COMBBLAS library names @@ -414,8 +478,9 @@ endif() if (XSDK_ENABLE_Fortran) include(FortranCInterface) FortranCInterface_HEADER(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_FortranCInterface.h - MACRO_NAMESPACE "FC_") + MACRO_NAMESPACE "FC_") FortranCInterface_VERIFY(CXX) + SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") endif() ###################################################################### @@ -458,19 +523,23 @@ if(enable_examples) endif() if (XSDK_ENABLE_Fortran) -# SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lstdc++") - SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") add_subdirectory(FORTRAN) endif() # superlu_dist uses c++11. 
PUBLIC means that the other codes linking to it need c++11 #target_compile_features(SuperLU_DIST PUBLIC cxx_std_11) +# Generate various configure files with proper definitions # configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc) configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc) + configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/SRC/superlu_dist_config.h) configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h) +# Following is to configure a file for FORTRAN code +configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/FORTRAN/superlu_dist_config.h) + + # Add pkg-config support if(IS_ABSOLUTE ${CMAKE_INSTALL_LIBDIR}) set(pkgconfig_libdir ${CMAKE_INSTALL_LIBDIR}) @@ -481,4 +550,4 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BI install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -#message("CMAKE_CXX_LINK_FLAGS '${CMAKE_CXX_LINK_FLAGS}'") +#message("MPI_Fortran_LINK_FLAGS '${MPI_Fortran_LINK_FLAGS}'") diff --git a/DoxyConfig b/DoxyConfig index e50bb432..2b18411f 100644 --- a/DoxyConfig +++ b/DoxyConfig @@ -31,8 +31,8 @@ PROJECT_NAME = SuperLU Distributed # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 6.3.0 -e +PROJECT_NUMBER = 7.1.0 + # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location @@ -847,7 +847,7 @@ TREEVIEW_WIDTH = 250 # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. -GENERATE_LATEX = NO +GENERATE_LATEX = YES # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be @@ -858,7 +858,7 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. -LATEX_CMD_NAME = latex +LATEX_CMD_NAME = pdflatex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt old mode 100644 new mode 100755 index 1253eb62..b4c6b371 --- a/EXAMPLE/CMakeLists.txt +++ b/EXAMPLE/CMakeLists.txt @@ -37,6 +37,7 @@ if(enable_double) set(DEXM pddrive.c dcreate_matrix.c) add_executable(pddrive ${DEXM}) target_link_libraries(pddrive ${all_link_libs}) + install(TARGETS pddrive RUNTIME DESTINATION "${INSTALL_BIN_DIR}") set(DEXM1 pddrive1.c dcreate_matrix.c) add_executable(pddrive1 ${DEXM1}) @@ -57,6 +58,22 @@ if(enable_double) add_executable(pddrive4 ${DEXM4}) target_link_libraries(pddrive4 ${all_link_libs}) + set(DEXM3D pddrive3d.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d ${DEXM3D}) + target_link_libraries(pddrive3d ${all_link_libs}) + + set(DEXM3D1 pddrive3d1.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d1 ${DEXM3D1}) + target_link_libraries(pddrive3d1 ${all_link_libs}) + + set(DEXM3D2 pddrive3d2.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d2 ${DEXM3D2}) + target_link_libraries(pddrive3d2 ${all_link_libs}) + + set(DEXM3D3 pddrive3d3.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d3 ${DEXM3D3}) + target_link_libraries(pddrive3d3 ${all_link_libs}) + set(DEXMG pddrive_ABglobal.c) add_executable(pddrive_ABglobal ${DEXMG}) target_link_libraries(pddrive_ABglobal ${all_link_libs}) @@ -80,10 +97,53 @@ if(enable_double) set(DEXMS pddrive_spawn.c dcreate_matrix.c) add_executable(pddrive_spawn ${DEXMS}) target_link_libraries(pddrive_spawn ${all_link_libs}) - - -endif() - 
+ install(TARGETS pddrive_spawn RUNTIME DESTINATION "${INSTALL_BIN_DIR}") + + + +endif() #### end enable_double + +if(enable_single) + set(SEXM psdrive.c screate_matrix.c) + add_executable(psdrive ${SEXM}) + target_link_libraries(psdrive ${all_link_libs}) + + set(SEXM1 psdrive1.c screate_matrix.c) + add_executable(psdrive1 ${SEXM1}) + target_link_libraries(psdrive1 ${all_link_libs}) + add_superlu_dist_example(psdrive1 big.rua 2 2) + + set(SEXM2 psdrive2.c screate_matrix.c screate_matrix_perturbed.c) + add_executable(psdrive2 ${SEXM2}) + target_link_libraries(psdrive2 ${all_link_libs}) + add_superlu_dist_example(psdrive2 big.rua 2 2) + + set(SEXM3 psdrive3.c screate_matrix.c) + add_executable(psdrive3 ${SEXM3}) + target_link_libraries(psdrive3 ${all_link_libs}) + add_superlu_dist_example(psdrive3 big.rua 2 2) + + set(SEXM4 psdrive4.c screate_matrix.c) + add_executable(psdrive4 ${SEXM4}) + target_link_libraries(psdrive4 ${all_link_libs}) + + set(SEXM3D psdrive3d.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d ${SEXM3D}) + target_link_libraries(psdrive3d ${all_link_libs}) + + set(SEXM3D1 psdrive3d1.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d1 ${SEXM3D1}) + target_link_libraries(psdrive3d1 ${all_link_libs}) + + set(SEXM3D2 psdrive3d2.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d2 ${SEXM3D2}) + target_link_libraries(psdrive3d2 ${all_link_libs}) + + set(SEXM3D3 psdrive3d3.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d3 ${SEXM3D3}) + target_link_libraries(psdrive3d3 ${all_link_libs}) + +endif() #### end enable_single if(enable_complex16) @@ -110,6 +170,22 @@ if(enable_complex16) add_executable(pzdrive4 ${ZEXM4}) target_link_libraries(pzdrive4 ${all_link_libs}) + set(ZEXM3D pzdrive3d.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d ${ZEXM3D}) + target_link_libraries(pzdrive3d ${all_link_libs}) + + set(ZEXM3D1 pzdrive3d1.c zcreate_matrix.c zcreate_matrix3d.c) + 
add_executable(pzdrive3d1 ${ZEXM3D1}) + target_link_libraries(pzdrive3d1 ${all_link_libs}) + + set(ZEXM3D2 pzdrive3d2.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d2 ${ZEXM3D2}) + target_link_libraries(pzdrive3d2 ${all_link_libs}) + + set(ZEXM3D3 pzdrive3d3.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d3 ${ZEXM3D3}) + target_link_libraries(pzdrive3d3 ${all_link_libs}) + set(ZEXMG pzdrive_ABglobal.c) add_executable(pzdrive_ABglobal ${ZEXMG}) target_link_libraries(pzdrive_ABglobal ${all_link_libs}) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 0af33c41..6fdf47e2 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -30,13 +30,18 @@ ####################################################################### include ../make.inc -DEXM = pddrive.o dcreate_matrix.o sp_ienv.o #pdgstrf2.o -#pdgssvx.o -# pdgstrs_lsum_X1.o pdgstrf_X1.o +DEXM = pddrive.o dcreate_matrix.o #pdgstrf2.o DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o + +DEXM3D = pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D1 = pddrive3d1.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D2 = pddrive3d2.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D3 = pddrive3d3.o dcreate_matrix.o dcreate_matrix3d.o + +# dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o DEXMG2 = pddrive2_ABglobal.o @@ -48,6 +53,11 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o + ZEXMG = pzdrive_ABglobal.o ZEXMG1 = 
pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -58,14 +68,16 @@ ZEXMG4 = pzdrive4_ABglobal.o all: double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ + pddrive3d pddrive3d1 pddrive3d2 pddrive3d3 \ pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ pddrive3_ABglobal pddrive4_ABglobal complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \ + pzdrive3d pzdrive3d1 pzdrive3d2 pzdrive3d3 \ pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \ pzdrive3_ABglobal pzdrive4_ABglobal -pddrive: $(DEXM) $(DSUPERLULIB) +pddrive: $(DEXM) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@ pddrive1: $(DEXM1) $(DSUPERLULIB) @@ -80,6 +92,18 @@ pddrive3: $(DEXM3) $(DSUPERLULIB) pddrive4: $(DEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@ +pddrive3d: $(DEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ + +pddrive3d1: $(DEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D1) $(LIBS) -lm -o $@ + +pddrive3d2: $(DEXM3D2) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D2) $(LIBS) -lm -o $@ + +pddrive3d3: $(DEXM3D3) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D3) $(LIBS) -lm -o $@ + pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@ @@ -113,6 +137,18 @@ pzdrive3: $(ZEXM3) $(DSUPERLULIB) pzdrive4: $(ZEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@ +pzdrive3d: $(ZEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D) $(LIBS) -lm -o $@ + +pzdrive3d1: $(ZEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D1) $(LIBS) -lm -o $@ + +pzdrive3d2: $(ZEXM3D2) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D2) $(LIBS) -lm -o $@ + +pzdrive3d3: $(ZEXM3D3) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D3) $(LIBS) -lm -o $@ + pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@ diff --git a/EXAMPLE/README b/EXAMPLE/README index b8fa138c..110484d2 100644 --- a/EXAMPLE/README +++ b/EXAMPLE/README @@ -1,11 +1,25 @@ - 
SuperLU_DIST EXAMPLES - ====================== + SuperLU_DIST EXAMPLES + ===================== This directory contains sample programs to illustrate how to use various functions provided in SuperLU_DIST. You can modify these examples to suit your applications. +(double real: pddrive* + double complex: pzdrive* ) The examples illustrate the following functionalities: + 0. pddrive3d.c: (invoke new communication-avoiding 3D algorithms) + Use PxGSSVX3D with the default options to solve a linear system + pddrive3d1.c: + Use PxGSSVX3D to solve the systems with same A but different + right-hand side. (Reuse the factored form of A) + pddrive3d2.c: + Use PxGSSVX3D to solve the systems with same sparsity pattern + of A. (Reuse the sparsity ordering) + pddrive3d2.c: + Use PxGSSVX3D to solve the systems with same sparsity pattern + and similar values. (Reuse sparsity ordering and row pivoting) + 1. pddrive.c, pddrive_ABglobal.c Use PDGSSVX with the full (default) options to solve a linear system. 2. pddrive1.c, pddrive1_ABglobal.c @@ -16,17 +30,22 @@ The examples illustrate the following functionalities: (Reuse the sparsity ordering) 4. pddrive3.c, pddrive3_ABglobal.c Solve the systems with the same sparsity pattern and similar values. + (Reuse sparsity ordering and row pivoting) 5. pddrive4.c, pddrive4_ABglobal.c Divide the processors into two subgroups (two grids) such that each subgroup solves a linear system independently from the other. The command line options "-r " and "-c " -defines the 2-D process grid. The total number of processes is: +defines the 2D process grid. The total number of processes is: = * If the options is not provided at the command line, the programs will use 1 processor as default in each case. +In the 3D code pddrive3d, the command line + "-r ", "-c " and "-d " +defines the 3D process grid. The Z-dimension must be power of two. 
+ Three input matrices (Harwell-Boeing format) are provided in this directory: g20.rua -- a real matrix of dimension 400x400 big.rua -- a real matrix of dimension 4960x4960 @@ -36,6 +55,10 @@ The command lines given below show how to run the parallel programs using "mpiexec". You may need to replace mpiexec by platform specific command. +0. To run the 3D example (pddrive3d), type + % mpiexec -n pddrive3d -r -c -d g20.rua + (e.g., mpiexec -n 8 pddrive3d -r 2 -c 2 -d 2 g20.rua) + 1. To run the real examples (pddrive, pddrive1, etc.) you may type: % mpiexec -n pddrive -r -c g20.rua diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c index 1784949b..8486f419 100644 --- a/EXAMPLE/dcreate_matrix.c +++ b/EXAMPLE/dcreate_matrix.c @@ -235,7 +235,6 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs, } - int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, char * postfix, gridinfo_t *grid) diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c new file mode 100644 index 00000000..6e9ccc21 --- /dev/null +++ b/EXAMPLE/dcreate_matrix3d.c @@ -0,0 +1,463 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_ddefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) double**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) double**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID3D (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
+ */ + +int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* 
Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + 
int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if (!strcmp(postfix, "rua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + 
{ + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/dnrformat_loc3d.c b/EXAMPLE/dnrformat_loc3d.c new file mode 100644 index 00000000..1fbaca69 --- /dev/null +++ b/EXAMPLE/dnrformat_loc3d.c @@ -0,0 +1,404 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void dGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ double *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an output;
+ 		              else it is an input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit (NOTE(review): shadows the function-scope declaration above)
+	int *nnz_disp; // displacement (NOTE(review): shadows the function-scope declaration above)
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->rowptr[0] = 0;
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	
+	if (grid3d->zscp.Iam == 0) {
+		A2d->rowptr[0] = 0;
+		
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	} /* end 2D layer grid-0 */
+		    
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	
+	/* Btmp <- compact(B), compacting B */
+	double *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	double *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+		    b_counts_int, b_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* dGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, p;
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    double *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    double *Btmp; // on 3D grid
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+	*/
+        MPI_Request recv_req;
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep;
+
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* dScatter_B3d */
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 4107abaa..10bc8091 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -23,7 +23,6 @@ at the top-level directory.
 
 #include 
 #include "superlu_ddefs.h"
-//#include "superlu_zdefs.h"
 
 /*! \brief
  *
@@ -140,7 +139,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( (iam >= nprow * npcol) || (iam == -1) ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -198,14 +197,16 @@ int main(int argc, char *argv[])
 	options.DiagInv           = NO;
      */
     set_default_options_dist(&options);
-	options.IterRefine = NOREFINE;
-	options.DiagInv = YES;
-    options.ReplaceTinyPivot  = YES;
+	// options.IterRefine = NOREFINE;
+	// options.DiagInv = YES;
+    // options.ReplaceTinyPivot  = YES;
+    
 	// options.Equil = NO; 
 	// options.ColPerm = NATURAL;
 	// options.RowPerm = NOROWPERM;  						  
 #if 0
-    options.RowPerm = LargeDiag_HWPM;
+    options.RowPerm           = LargeDiag_HWPM;
+    options.RowPerm = NOROWPERM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
@@ -232,10 +233,16 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+	    printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+	    fflush(stdout);
+	}
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		         nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
@@ -248,9 +255,7 @@ int main(int argc, char *argv[])
     dScalePermstructFree(&ScalePermstruct);
     dDestroy_LU(n, &grid, &LUstruct);
     dLUstructFree(&LUstruct);
-    if ( options.SolveInitialized ) {
-        dSolveFinalize(&options, &SOLVEstruct);
-    }
+    dSolveFinalize(&options, &SOLVEstruct);
     SUPERLU_FREE(b);
     SUPERLU_FREE(xtrue);
     SUPERLU_FREE(berr);
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 8e5396a9..f058bd40 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -14,10 +14,11 @@ at the top-level directory.
  * \brief Driver program for PDGSSVX example
  *
  * 
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4, 2020
  * 
*/ @@ -33,7 +34,8 @@ at the top-level directory. * The driver program PDDRIVE1. * * This example illustrates how to use PDGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PDGSSVX, and reuse the following data structures * in the subsequent call to PDGSSVX: @@ -54,8 +56,8 @@ int main(int argc, char *argv[]) dSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - double *b, *xtrue, *b1; - int i, j, m, n; + double *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -65,7 +67,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -104,7 +106,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. 
*/ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -141,14 +143,24 @@ int main(int argc, char *argv[]) dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -171,9 +183,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ dScalePermstructInit(m, n, &ScalePermstruct); dLUstructInit(n, &LUstruct); @@ -182,41 +191,90 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. 
*/ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pdgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pdgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); dScalePermstructFree(&ScalePermstruct); dDestroy_LU(n, &grid, &LUstruct); @@ -226,6 +284,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c index ad74edc6..7686b79c 100644 --- a/EXAMPLE/pddrive1_ABglobal.c +++ b/EXAMPLE/pddrive1_ABglobal.c @@ -72,6 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { @@ -105,8 +111,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c index 5ad5a3ae..6bdc7007 100644 --- a/EXAMPLE/pddrive2.c +++ b/EXAMPLE/pddrive2.c @@ -33,8 +33,8 @@ at the top-level directory. * * The driver program PDDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PDGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PDGSSVX: @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -147,7 +147,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -156,7 +157,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: @@ -190,20 +191,27 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ dDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -218,18 +226,25 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. 
*/ /* Solve the linear system. */ pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } #if ( PRNTlevel>=2 ) if (iam==0) { PrintInt10("new perm_r", m, ScalePermstruct.perm_r); diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c index fad13d43..e908a6ca 100644 --- a/EXAMPLE/pddrive2_ABglobal.c +++ b/EXAMPLE/pddrive2_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -105,8 +110,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c index 4287cae0..d3d6683c 100644 --- a/EXAMPLE/pddrive3.c +++ b/EXAMPLE/pddrive3.c @@ -35,9 +35,9 @@ at the top-level directory. * This example illustrates how to use PDGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. 
- * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PDGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PDGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -206,8 +206,15 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -231,8 +238,9 @@ int main(int argc, char *argv[]) nzval1[0] += 1.0e-8; } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ dZeroLblocks(iam, n, &grid, &LUstruct); + dZeroUblocks(iam, n, &grid, &LUstruct); dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -242,16 +250,23 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - if ( !iam ) - printf("Solve a system with the same pattern and similar values.\n"); - pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c index 775dc5a7..e20c664d 100644 --- a/EXAMPLE/pddrive3_ABglobal.c +++ b/EXAMPLE/pddrive3_ABglobal.c @@ -78,7 +78,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -111,8 +116,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c new file mode 100644 index 00000000..02f4837c --- /dev/null +++ b/EXAMPLE/pddrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. 
Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D.
+ *
+ * This example illustrates how to use PDGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pdgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

pddrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c new file mode 100644 index 00000000..b1a553a4 --- /dev/null +++ b/EXAMPLE/pddrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D1.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PDGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PDGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d1 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
+ *
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *
+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d2.c b/EXAMPLE/pddrive3d2.c new file mode 100644 index 00000000..5fed6157 --- /dev/null +++ b/EXAMPLE/pddrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D2.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d2 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
+ *
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *
+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int dcreate_matrix_perturbed + (SuperMatrix *, int, double **, int *, double **, int *, + FILE *, gridinfo_t *); + extern int dcreate_matrix_perturbed_postfix + (SuperMatrix *, int, double **, int *, double **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + dDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d3.c b/EXAMPLE/pddrive3d3.c new file mode 100644 index 00000000..3a7ccb59 --- /dev/null +++ b/EXAMPLE/pddrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D3.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PDGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np 

pddrive3d3 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + dZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + dZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index d7289de5..6cbac44f 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); @@ -191,9 +191,16 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid1); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -256,10 +263,17 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid2); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index f870de85..2cf76078 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -71,6 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); @@ -126,7 +132,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c index d6719e41..3541ab92 100644 --- a/EXAMPLE/pddrive_ABglobal.c +++ b/EXAMPLE/pddrive_ABglobal.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -106,8 +111,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( VAMPIR>=1 ) VT_traceoff(); diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 47b04729..b119b46e 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) @@ -151,7 +156,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -247,7 +252,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. */ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ @@ -265,7 +270,7 @@ int main(int argc, char *argv[]) result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf("**************************************************\n"); fflush(stdout); @@ -302,6 +307,7 @@ int main(int argc, char *argv[]) RELEASE THE SUPERLU PROCESS GRID. 
------------------------------------------------------------*/ out: +if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -309,7 +315,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c new file mode 100644 index 00000000..72db7f96 --- /dev/null +++ b/EXAMPLE/psdrive.c @@ -0,0 +1,291 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * December 6, 2018
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE.
+ *
+ * This example illustrates how to use PSGSSVX with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *xtrue; + int m, n; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + int cpp_defs(); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + //MPI_Init( &argc, &argv ); + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( VTUNE>=1 ) + __itt_pause(); +#endif + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %4d)\n", nprow); + printf("\t-c : process columns (default %4d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + if(grid.iam==0){ + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( (iam >= nprow * npcol) || (iam == -1) ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid, &LUstruct); + sLUstructFree(&LUstruct); + sSolveFinalize(&options, &SOLVEstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive1.c b/EXAMPLE/psdrive1.c new file mode 100644 index 00000000..ac8f1d75 --- /dev/null +++ b/EXAMPLE/psdrive1.c @@ -0,0 +1,327 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * January 4 2020
+ * 
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE1.
+ *
+ * This example illustrates how to use PSGSSVX to
+ * solve systems with the same A but different right-hand side,
+ * possibly with different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to
+ * PSGSSVX, and reuse the following data structures
+ * in the subsequent call to PSGSSVX:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive1 -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 3; /* Max. number of right-hand sides. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc; + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + nrhs = 1; + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. 
*/ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + psgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + psgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + psinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(b1); + SUPERLU_FREE(b2); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive1_ABglobal.c b/EXAMPLE/psdrive1_ABglobal.c new file mode 100644 index 00000000..2ae6fedb --- /dev/null +++ b/EXAMPLE/psdrive1_ABglobal.c @@ -0,0 +1,281 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive1_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * psgssvx_ABglobal, and reuse the following data structures
+ * in the subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive1_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *b, *b1, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) + goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]"); + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + if ( !(xtrue = floatMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve the system with a different B.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. 
*/ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(b1); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c new file mode 100644 index 00000000..ffc93170 --- /dev/null +++ b/EXAMPLE/psdrive2.c @@ -0,0 +1,311 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * December 31, 2016 version 5.1.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE2.
+ *
+ * This example illustrates how to use PSGSSVX to solve systems
+ * repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> psdrive2 -r <proc rows> -c <proc columns> g20.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + NRformat_loc *Astore; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int_t *colind, *colind1, *rowptr, *rowptr1; + int_t i, j, m, n, nnz_loc, m_loc; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + /* prototypes */ + extern int screate_matrix_perturbed + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, gridinfo_t *); + extern int screate_matrix_perturbed_postfix + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %4d)\n", nprow); + printf("\t-c : process columns (default %4d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc; + + /* ------------------------------------------------------------ + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. 
*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + + if (iam==0) { + print_options_dist(&options); +#if ( PRNTlevel>=2 ) + PrintInt10("perm_r", m, ScalePermstruct.perm_r); + PrintInt10("perm_c", n, ScalePermstruct.perm_c); +#endif + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Solve the linear system. */ + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } +#if ( PRNTlevel>=2 ) + if (iam==0) { + PrintInt10("new perm_r", m, ScalePermstruct.perm_r); + PrintInt10("new perm_c", n, ScalePermstruct.perm_c); + } +#endif + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue1); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} + + diff --git a/EXAMPLE/psdrive2_ABglobal.c b/EXAMPLE/psdrive2_ABglobal.c new file mode 100644 index 00000000..fe4e5a29 --- /dev/null +++ b/EXAMPLE/psdrive2_ABglobal.c @@ -0,0 +1,298 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive2_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once.  The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive2_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + Process 0 reads the matrix A, and then broadcasts it to all + the other processes. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. 
*/ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve the system with the same sparsity pattern.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c new file mode 100644 index 00000000..eaa2e989 --- /dev/null +++ b/EXAMPLE/psdrive3.c @@ -0,0 +1,319 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3.
+ *
+ * This example illustrates how to use PSGSSVX to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PSGSSVX:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX, and reused in the subsequent call.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> psdrive3 -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + NRformat_loc *Astore; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *b1, *xtrue, *nzval, *nzval1; + int_t *colind, *colind1, *rowptr, *rowptr1; + int_t i, j, m, n, nnz_loc, m_loc, fst_row; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iinnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nzval = Astore->nzval; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval1 = floatMalloc_dist(nnz_loc); + colind1 = intMalloc_dist(nnz_loc); + rowptr1 = intMalloc_dist(m_loc+1); + for (i = 0; i < nnz_loc; ++i) { + nzval1[i] = nzval[i]; + colind1[i] = colind[i]; + } + for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i]; + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. 
*/ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Set up the local A in NR_loc format */ + + /* Perturb the 1st diagonal of the matrix to larger value. + Intention is to change values of A. */ + if (iam == 0) { + } + + /* Zero the numerical values in L and U. */ + sZeroLblocks(iam, n, &grid, &LUstruct); + sZeroUblocks(iam, n, &grid, &LUstruct); + + sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, + nzval1, colind1, rowptr1, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* Print the statistics. 
*/ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3_ABglobal.c b/EXAMPLE/psdrive3_ABglobal.c new file mode 100644 index 00000000..ee06fa2c --- /dev/null +++ b/EXAMPLE/psdrive3_ABglobal.c @@ -0,0 +1,305 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive3_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to psgssvx_ABglobal, and reused in the subsequent call.
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive3_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. 
*/ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve a system with the same pattern and similar values.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d.c b/EXAMPLE/psdrive3d.c new file mode 100644 index 00000000..6cbc21e4 --- /dev/null +++ b/EXAMPLE/psdrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D.
+ *
+ * This example illustrates how to use PSGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

psdrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = floatMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c new file mode 100644 index 00000000..c206d2fe --- /dev/null +++ b/EXAMPLE/psdrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D1.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PSGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PSGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d1 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = floatMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = floatMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = floatMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d2.c b/EXAMPLE/psdrive3d2.c new file mode 100644 index 00000000..d6e0adef --- /dev/null +++ b/EXAMPLE/psdrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D2.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve 
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d2 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int screate_matrix_perturbed + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, gridinfo_t *); + extern int screate_matrix_perturbed_postfix + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + sDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d3.c b/EXAMPLE/psdrive3d3.c new file mode 100644 index 00000000..89095fc5 --- /dev/null +++ b/EXAMPLE/psdrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D3.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve 
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d3 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + sZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + sZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive4.c b/EXAMPLE/psdrive4.c new file mode 100644 index 00000000..55057ca2 --- /dev/null +++ b/EXAMPLE/psdrive4.c @@ -0,0 +1,315 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE4.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n 10 psdrive4 big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n; + int nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + int ii, omp_mpi_level; + char **cpp, c, *postfix; + FILE *fp, *fopen(); + int cpp_defs(); + + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. 
+ ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. 
*/ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid1, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid2, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. 
+ ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive4_ABglobal.c b/EXAMPLE/psdrive4_ABglobal.c new file mode 100644 index 00000000..44736a07 --- /dev/null +++ b/EXAMPLE/psdrive4_ABglobal.c @@ -0,0 +1,361 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive4_ABglobal.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive4_ABglobal  -procs 10
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n, nnz; + int_t nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. 
+ ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid1, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid2, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. 
+ ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive_ABglobal.c b/EXAMPLE/psdrive_ABglobal.c new file mode 100644 index 00000000..7d7cdc91 --- /dev/null +++ b/EXAMPLE/psdrive_ABglobal.c @@ -0,0 +1,256 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx_ABglobal
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive_ABglobal -r  -c   -procs 

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index bf6c3e9b..493997b2 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( (iam >= nprow * npcol) || (iam == -1) ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) */ set_default_options_dist(&options); #if 0 - options.RowPerm = LargeDiag_HWPM; + options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; @@ -224,10 +224,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. 
*/ @@ -240,9 +246,7 @@ int main(int argc, char *argv[]) zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); zLUstructFree(&LUstruct); - if ( options.SolveInitialized ) { - zSolveFinalize(&options, &SOLVEstruct); - } + zSolveFinalize(&options, &SOLVEstruct); SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index 6718cd0e..e0f8ff4b 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -13,10 +13,11 @@ at the top-level directory. * \brief Driver program for PZGSSVX example * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4 2020
  * 
*/ @@ -32,7 +33,8 @@ at the top-level directory. * The driver program PZDRIVE1. * * This example illustrates how to use PZGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PZGSSVX, and reuse the following data structures * in the subsequent call to PZGSSVX: @@ -53,8 +55,8 @@ int main(int argc, char *argv[]) zSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - doublecomplex *b, *xtrue, *b1; - int i, j, m, n; + doublecomplex *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -64,7 +66,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -103,7 +105,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. 
*/ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -140,14 +142,24 @@ int main(int argc, char *argv[]) zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -170,9 +182,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ zScalePermstructInit(m, n, &ScalePermstruct); zLUstructInit(n, &LUstruct); @@ -181,41 +190,90 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. 
*/ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pzgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pzgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); @@ -225,6 +283,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c index d2a048d4..bf47169f 100644 --- a/EXAMPLE/pzdrive1_ABglobal.c +++ b/EXAMPLE/pzdrive1_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -104,8 +109,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c index df9b2263..ac042c2b 100644 --- a/EXAMPLE/pzdrive2.c +++ b/EXAMPLE/pzdrive2.c @@ -32,8 +32,8 @@ at the top-level directory. * * The driver program PZDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PZGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PZGSSVX: @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -146,7 +146,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -155,7 +156,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: @@ -189,20 +190,27 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ zDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -217,18 +225,25 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. 
*/ /* Solve the linear system. */ pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } #if ( PRNTlevel>=2 ) if (iam==0) { PrintInt10("new perm_r", m, ScalePermstruct.perm_r); diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c index b6c00cbb..96866c35 100644 --- a/EXAMPLE/pzdrive2_ABglobal.c +++ b/EXAMPLE/pzdrive2_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -104,8 +109,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c index c00415d6..df7a7479 100644 --- a/EXAMPLE/pzdrive3.c +++ b/EXAMPLE/pzdrive3.c @@ -34,9 +34,9 @@ at the top-level directory. * This example illustrates how to use PZGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. 
- * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PZGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PZGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -205,8 +205,15 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -230,8 +237,9 @@ int main(int argc, char *argv[]) nzval1[0].r += 1.0e-8; nzval1[0].i += 1.0e-8; } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ zZeroLblocks(iam, n, &grid, &LUstruct); + zZeroUblocks(iam, n, &grid, &LUstruct); zCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -241,16 +249,23 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - if ( !iam ) - printf("Solve a system with the same pattern and similar values.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c index e1f21ce1..144e3f6e 100644 --- a/EXAMPLE/pzdrive3_ABglobal.c +++ b/EXAMPLE/pzdrive3_ABglobal.c @@ -77,7 +77,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -110,8 +115,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c new file mode 100644 index 00000000..1d35fb24 --- /dev/null +++ b/EXAMPLE/pzdrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. 
Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D.
+ *
+ * This example illustrates how to use PZGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
 + *   4. Call pzgssvx3d
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

pzdrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c new file mode 100644 index 00000000..47905c56 --- /dev/null +++ b/EXAMPLE/pzdrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! 
@file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D1.
+ *
 + * This example illustrates how to use PZGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PZGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PZGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

pzdrive3d1 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d2.c b/EXAMPLE/pzdrive3d2.c new file mode 100644 index 00000000..759b2afb --- /dev/null +++ b/EXAMPLE/pzdrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D2.
+ *
 + * This example illustrates how to use PZGSSVX3D to solve
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

pzdrive3d2 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int zcreate_matrix_perturbed + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, gridinfo_t *); + extern int zcreate_matrix_perturbed_postfix + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + zDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d3.c b/EXAMPLE/pzdrive3d3.c new file mode 100644 index 00000000..9eab4a11 --- /dev/null +++ b/EXAMPLE/pzdrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! 
@file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D3.
+ *
+ * This example illustrates how to use PZGSSVX3D to solve
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PZGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np <p> pzdrive3d3 -r <proc rows> -c <proc columns> \
+ *                               -d <proc Z-dimension> <input matrix file>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + zZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + zZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 3b768f81..1b95ded7 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); @@ -190,9 +190,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid1); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -255,10 +262,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid2); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 4ec17583..9b3ff81b 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -70,6 +70,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); @@ -125,7 +131,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c index 5871f7b8..b6f48554 100644 --- a/EXAMPLE/pzdrive_ABglobal.c +++ b/EXAMPLE/pzdrive_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -105,9 +110,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; - + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index faf725c6..8dab3751 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) @@ -151,7 +156,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -247,7 +252,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. */ pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ @@ -266,7 +271,7 @@ int main(int argc, char *argv[]) // result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf(" Solve time : %8.2f \n", stat.utime[SOLVE]); @@ -305,6 +310,7 @@ int main(int argc, char *argv[]) RELEASE THE SUPERLU PROCESS GRID. 
------------------------------------------------------------*/ out: + if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -312,7 +318,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c index 8efa3285..153dca2c 100644 --- a/EXAMPLE/pzutil.c +++ b/EXAMPLE/pzutil.c @@ -1,15 +1,15 @@ /*! \file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ -/*! @file +/*! @file * \brief Several matrix utilities * *
@@ -38,15 +38,15 @@ int pzCompRow_loc_to_CompCol_global
     int_t *colind, *rowptr;
     int_t *colptr_loc, *rowind_loc;
     int_t m_loc, n, i, j, k, l;
-    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
+    int_t colnnz, fst_row, nnz_loc, nnz;
     doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
     doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
-    int_t *colcnt, *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
 			   local block rows.
 			   Use n_loc+1 pointers for each block. */
     int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
+			   redistribution to the local block columns.
 			   Use n_loc+1 pointers for each block. */
     int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
     int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
@@ -164,7 +164,7 @@ int pzCompRow_loc_to_CompCol_global
                       a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
                       grid->comm);
     }
-      
+
     /* Reset colptr_loc[] to point to the n_loc global columns. */
     colptr_loc[0] = 0;
     itemp = colptr_send;
@@ -178,7 +178,7 @@ int pzCompRow_loc_to_CompCol_global
 	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
     }
     itemp[n_loc] = colptr_loc[n_loc];
-      
+
     /* Merge blocks of row indices into columns of row indices. */
     for (i = 0; i < procs; ++i) {
         k = i * (n_loc + 1);
@@ -219,12 +219,12 @@ int pzCompRow_loc_to_CompCol_global
     MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
     for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
     GAstore->nnz = nnz;
-    
+
     if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
     if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
+
     /* Allgatherv for row indices. */
     rdispls[0] = 0;
     for (i = 0; i < procs-1; ++i) {
@@ -233,12 +233,12 @@ int pzCompRow_loc_to_CompCol_global
     }
     itemp_32[procs-1] = itemp[procs-1];
     it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
     if ( need_value ) {
       if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
           ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval,
 		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
     } else GAstore->nzval = NULL;
 
@@ -249,7 +249,7 @@ int pzCompRow_loc_to_CompCol_global
         itemp_32[i] = n_locs[i];
     }
     itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
 
     /* Recompute column pointers. */
@@ -277,7 +277,7 @@ int pzCompRow_loc_to_CompCol_global
     SUPERLU_FREE(rowind_recv);
     if ( need_value) SUPERLU_FREE(a_recv);
 #if ( DEBUGlevel>=1 )
-    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
+    if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat));
     CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
 #endif
     return 0;
@@ -371,7 +371,7 @@ int pzPermute_Dense_Matrix
 	++ptr_to_ibuf[p];
 	ptr_to_dbuf[p] += nrhs;
     }
-	  
+
     /* Transfer the (permuted) row indices and numerical values. */
     MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
 		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
@@ -397,12 +397,299 @@ int pzPermute_Dense_Matrix
 } /* pzPermute_Dense_Matrix */
 
 
+/*! \brief Allocate storage in LUstruct */
+void zLUstructInit(const int_t n, zLUstruct_t *LUstruct)
+{
+    if ( !(LUstruct->etree = intMalloc_dist(n)) )
+	ABORT("Malloc fails for etree[].");
+    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
+	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
+	ABORT("Malloc fails for Glu_persist_t.");
+    if ( !(LUstruct->Llu = (zLocalLU_t *)
+	   SUPERLU_MALLOC(sizeof(zLocalLU_t))) )
+	ABORT("Malloc fails for LocalLU_t.");
+	LUstruct->Llu->inv = 0;
+}
+
+/*! \brief Deallocate LUstruct */
+void zLUstructFree(zLUstruct_t *LUstruct)
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zLUstructFree()");
+#endif
+
+    SUPERLU_FREE(LUstruct->etree);
+    SUPERLU_FREE(LUstruct->Glu_persist);
+    SUPERLU_FREE(LUstruct->Llu);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zLUstructFree()");
+#endif
+}
+
+/*! \brief Destroy distributed L & U matrices. */
+void
+zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zDestroy_LU()");
+#endif
+
+    zDestroy_Tree(n, grid, LUstruct);
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc    
+    //#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#endif
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    SUPERLU_FREE(Llu->ToSendR[0]);
+    SUPERLU_FREE(Llu->ToSendR);
+
+    /* The following can be freed only after iterative refinement. */
+    SUPERLU_FREE(Llu->ilsum);
+    SUPERLU_FREE(Llu->fmod);
+    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE(Llu->fsendx_plist);
+    SUPERLU_FREE(Llu->bmod);
+    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE(Llu->bsendx_plist);
+    SUPERLU_FREE(Llu->mod_bit);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+	}	
+    SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i) {
+	if(Llu->Linv_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+	}
+	if(Llu->Uinv_bc_ptr[i]!=NULL){
+	    SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+	}	
+    }
+    SUPERLU_FREE(Llu->Linv_bc_ptr);
+    SUPERLU_FREE(Llu->Uinv_bc_ptr);
+    SUPERLU_FREE(Llu->Unnz);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Urbs[i] ) {
+	    SUPERLU_FREE(Llu->Ucb_indptr[i]);
+	    SUPERLU_FREE(Llu->Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Llu->Ucb_indptr);
+    SUPERLU_FREE(Llu->Ucb_valptr);	
+    SUPERLU_FREE(Llu->Urbs);
+
+    SUPERLU_FREE(Glu_persist->xsup);
+    SUPERLU_FREE(Glu_persist->supno);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zDestroy_LU()");
+#endif
+}
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * 
+ */ +int_t +pzgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, zSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. 
*/ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). */ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + 
rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PZGSTRS_INIT */ + + /*! \brief Initialize the data structure for the solution phase. */ -int zSolveInit(superlu_options_t *options, SuperMatrix *A, +int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + zLUstruct_t *LUstruct, gridinfo_t *grid, + zSOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; @@ -413,29 +700,21 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, fst_row = Astore->fst_row; m_loc = Astore->m_loc; procs = grid->nprow * grid->npcol; - - if ( !grid->iam ) printf("@@@ enter zSolveInit, A->nrow %d\n", A->nrow); if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) ABORT("Malloc fails for row_to_proc[]"); - if ( !grid->iam ) { printf("@@@ malloc(1) zSolveInit\n"); fflush(stdout); } SOLVEstruct->row_to_proc = row_to_proc; - if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); - if ( !grid->iam ) { printf("@@@ malloc(2) zSolveInit\n"); fflush(stdout); } - for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; SOLVEstruct->inv_perm_c = inv_perm_c; - if ( !grid->iam ) printf("@@@ after malloc zSolveInit\n"); - /* ------------------------------------------------------------ EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. 
- + NOTE: For those processes that do not own any row, it must - must be set so that fst_row == A->nrow. + must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); @@ -445,11 +724,6 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (p = 0; p < procs; ++p) { for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } - - if ( !grid->iam ) printf("@@@ after allgather zSolveInit\n"); - -#define DEBUGlevel 2 - #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("fst_row = %d\n", fst_row); @@ -475,34 +749,35 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (i = j ; i < k; ++i) row_to_proc[i] = p; } } -#endif +#endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, &SOLVEstruct->num_diag_procs, &SOLVEstruct->diag_procs, &SOLVEstruct->diag_len); + /* Setup communication pattern for redistribution of B and X. */ if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + pzgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; - + options->SolveInitialized = YES; return 0; } /* zSolveInit */ /*! \brief Release the resources used for the solution phase. 
*/ -void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) +void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; pxgstrs_finalize(SOLVEstruct->gstrs_comm); + if ( options->RefineInitialized ) { pzgsmv_finalize(SOLVEstruct->gsmv_comm); options->RefineInitialized = NO; @@ -512,19 +787,19 @@ void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); + if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); options->SolveInitialized = NO; } /* zSolveFinalize */ -/*! \brief Check the inf-norm of the error vector +/*! \brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; doublecomplex temp; - int i, j; + int i, j, ii; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; @@ -534,16 +809,83 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx z_sub(&temp, &x_work[i], &xtrue_work[i]); err = SUPERLU_MAX(err, slud_z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i])); +#if 1 + if (err > 1.e-4 && iam == 1) { + ii = i; + printf("(wrong proc %d) wrong index ii %d\n", iam, ii); + PrintDoublecomplex("x_work(ii)", 5, &x[ii]); + PrintDoublecomplex("x_true(ii)", 5, &xtrue_work[ii]); + fflush(stdout); + break; + } +#endif } +#if 0 + printf("\t(%d) loc n %d: err = %e\txnorm = %e\n", iam, n, err, xnorm); + if (iam == 4) { + printf("ii %d\n", ii); + PrintDoublecomplex("x_work", 5, x); + PrintDoublecomplex("x_true", 5, xtrue_work); + } + fflush(stdout); +#endif /* get the golbal max err & xnrom */ temperr = err; 
tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + if ( !iam ) { + printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + fflush(stdout); + } + } +} + +/*! \brief Destroy broadcast and reduction trees used in triangular solve */ +void +zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + zLocalLU_t *Llu = LUstruct->Llu; +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter Destroy_Tree()"); +#endif + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); + } + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); + } + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); + } } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit zDestroy_Tree()"); +#endif } + diff --git a/EXAMPLE/screate_matrix.c b/EXAMPLE/screate_matrix.c new file mode 100644 index 00000000..2fe905bf --- /dev/null +++ b/EXAMPLE/screate_matrix.c @@ -0,0 +1,427 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * SCREATE_MATRIX read the matrix from data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
 + * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * 
+ */ + +int screate_matrix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < 
colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char * postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. 
*/ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < 
colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/screate_matrix3d.c b/EXAMPLE/screate_matrix3d.c new file mode 100644 index 00000000..5cb123d0 --- /dev/null +++ b/EXAMPLE/screate_matrix3d.c @@ -0,0 +1,463 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * SCREATE_MATRIX read the matrix from data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
 + * It also generates the distributed true solution X
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
 + * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
 + * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo3d_t*
 + *       The 3D process mesh.
+ * 
+ */ + +int screate_matrix3d(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer 
the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int screate_matrix_postfix3d(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, 
*rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if (!strcmp(postfix, "rua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + 
for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/screate_matrix_perturbed.c b/EXAMPLE/screate_matrix_perturbed.c new file mode 100644 index 00000000..0f1d6625 --- /dev/null +++ b/EXAMPLE/screate_matrix_perturbed.c @@ -0,0 +1,419 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * December 31, 2016
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * SCREATE_MATRIX_PERTURBED read the matrix from data file in
+ * Harwell-Boeing format, and distribute it to processors in a distributed
 + * compressed row format. It also generates the distributed true solution X
+ * and the right-hand side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * 
+ */ + +int screate_matrix_perturbed(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. 
*/ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + relpos = marker[row - fst_row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row - fst_row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + 
/******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char *postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. 
*/ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. 
*/ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + relpos = marker[row - fst_row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row - fst_row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + 
/******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/sp_ienv.c b/EXAMPLE/sp_ienv.c index 195cf8c2..651a1be7 100644 --- a/EXAMPLE/sp_ienv.c +++ b/EXAMPLE/sp_ienv.c @@ -67,8 +67,8 @@ at the top-level directory. #include -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c new file mode 100644 index 00000000..18a5a4e8 --- /dev/null +++ b/EXAMPLE/zcreate_matrix3d.c @@ -0,0 +1,462 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) doublecomplex**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) doublecomplex**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
+ */ + +int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = 
(int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; 
/* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter zcreate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"cua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for 
rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 16c9c615..4390f0b7 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -1,41 +1,102 @@ -# Sherry; may not need it? +# include the paths for header files include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC) +include_directories(${SuperLU_DIST_BINARY_DIR}/FORTRAN) -# Fortran stuff +set(sources "superlu_c2f_wrap.c") # initialize precision-independent file + +if(enable_double) + list(APPEND sources c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c) +endif() +if(enable_complex16) + list(APPEND sources c2f_zcreate_matrix_x_b.c superlu_c2f_zwrap.c) +endif() + +add_library(superlu_dist_fortran ${sources}) +set(targets superlu_dist_fortran) +get_target_property(superlu_dist_version superlu_dist VERSION) +get_target_property(superlu_dist_soversion superlu_dist SOVERSION) +set_target_properties(superlu_dist_fortran PROPERTIES VERSION ${superlu_dist_version}) +set_target_properties(superlu_dist_fortran PROPERTIES SOVERSION ${superlu_dist_soversion}) +target_link_libraries(superlu_dist_fortran superlu_dist) + +# depends on FPP defs +add_dependencies(superlu_dist_fortran config_f) + +install(TARGETS superlu_dist_fortran +# DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION "${INSTALL_BIN_DIR}" + 
LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" +) + +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN *.mod + ) +install(FILES superlu_dist_config.fh + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + +# Fortran MPI stuff add_definitions(${MPI_Fortran_COMPILE_FLAGS}) include_directories(${MPI_Fortran_INCLUDE_PATH}) link_directories(${MPI_Fortran_LIBRARIES}) -# Libs linked to all of the examples -set(all_link_libs superlu_dist ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +# Libs to be linked with the Fortran codes +set(fortran_link_libs superlu_dist_fortran ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +set(all_link_libs ${fortran_link_libs} superlu_dist) + #message("!!! in Fortran: MPI_Fortran_LINK_FLAGS='${MPI_Fortran_LINK_FLAGS}'") #message("!!! in Fortran: all_link_libs='${all_link_libs}'") -message("!!! in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") +#message("!!! 
in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") if (NOT MSVC) list(APPEND all_link_libs m) endif () -set(F_MOD superlupara.f90 superlu_mod.f90) -if(enable_double) - set(C_DWRAP dcreate_dist_matrix.c superlu_c2f_dwrap.c) - set(F_DEXM ${F_MOD} dhbcode1.f90 f_pddrive.f90 ${C_DWRAP}) +add_library(ftestmod STATIC superlupara.f90 superlu_mod.f90) + +#if(enable_double) +if(FALSE) + set(F_DEXM ${F_MOD} f_pddrive.F90) add_executable(f_pddrive ${F_DEXM}) - target_link_libraries(f_pddrive ${all_link_libs}) + target_link_libraries(f_pddrive ftestmod ${all_link_libs}) + # set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c ${C_DWRAP}) + set(F_DEXM3D f_pddrive3d.F90) + add_executable(f_pddrive3d ${F_DEXM3D}) + target_link_libraries(f_pddrive3d ftestmod ${all_link_libs}) + set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + + set(F_5x5 f_5x5.F90 sp_ienv.c) add_executable(f_5x5 ${F_5x5}) - target_link_libraries(f_5x5 ${all_link_libs}) -# set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE Fortran) + target_link_libraries(f_5x5 ftestmod ${all_link_libs}) set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + endif() -if(enable_complex16) - set(C_ZWRAP zcreate_dist_matrix.c superlu_c2f_zwrap.c) - set(F_ZEXM ${F_MOD} zhbcode1.f90 f_pzdrive.f90 ${C_ZWRAP}) +# if(enable_complex16) +if(FALSE) + set(F_ZEXM f_pzdrive.F90) add_executable(f_pzdrive ${F_ZEXM}) - target_link_libraries(f_pzdrive ${all_link_libs}) + target_link_libraries(f_pzdrive ftestmod ${all_link_libs}) # set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + + set(F_ZEXM3D f_pzdrive3d.F90) + 
add_executable(f_pzdrive3d ${F_ZEXM3D}) + target_link_libraries(f_pzdrive3d ftestmod ${all_link_libs}) + set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + endif() + + +# Format superlu_dist_config.fh from superlu_dist_config.h in C +add_custom_command( + OUTPUT superlu_dist_config.fh + COMMAND sed;'/^\\//;d';<;superlu_dist_config.h;>;superlu_dist_config.fh + COMMAND cp;superlu_dist_config.fh;${SuperLU_DIST_SOURCE_DIR}/FORTRAN/. + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/FORTRAN +) +add_custom_target(config_f DEPENDS superlu_dist_config.fh) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index d275c360..008e3431 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -8,29 +8,38 @@ # ####################################################################### .SUFFIXES: -.SUFFIXES: .f90 .c .o +.SUFFIXES: .f90 .F90 .c .o include ../make.inc #F90FLAGS = $(FFLAGS) -qfree -qsuffix=f=f90 -qflag=w:w F_MOD = superlupara.o superlu_mod.o -C_DWRAP = dcreate_dist_matrix.o superlu_c2f_dwrap.o -C_ZWRAP = zcreate_dist_matrix.o superlu_c2f_zwrap.o +#C_DWRAP = c2f_dcreate_matrix_x_b.o superlu_c2f_dwrap.o +#C_ZWRAP = c2f_zcreate_matrix_x_b.o superlu_c2f_zwrap.o -F_DEXM = $(F_MOD) dhbcode1.o f_pddrive.o -F_ZEXM = $(F_MOD) zhbcode1.o f_pzdrive.o -F_5x5 = $(F_MOD) f_5x5.o sp_ienv.o +F_DEXM = $(F_MOD) f_pddrive.o +F_DEXM3D= $(F_MOD) f_pddrive3d.o +F_ZEXM = $(F_MOD) f_pzdrive.o +F_ZEXM3D= $(F_MOD) f_pzdrive3d.o +F_5x5 = $(F_MOD) f_5x5.o sp_ienv.o -all: f_pddrive f_pzdrive f_5x5 -f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_5x5) $(C_DWRAP) $(LIBS) -o $@ +all: f_pddrive f_pddrive3d f_pzdrive f_pzdrive3d f_5x5 -f_pddrive: $(F_DEXM) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_DEXM) $(C_DWRAP) $(LIBS) -o $@ +f_5x5: $(F_5x5) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_5x5) $(LIBS) -o $@ -f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_ZEXM) $(C_ZWRAP) $(LIBS) -o $@ 
+f_pddrive: $(F_DEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM) $(LIBS) -o $@ + +f_pddrive3d: $(F_DEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM3D) $(LIBS) -o $@ + +f_pzdrive: $(F_ZEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM) $(LIBS) -o $@ + +f_pzdrive3d: $(F_ZEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM3D) $(LIBS) -o $@ .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) @@ -38,6 +47,9 @@ f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB) .f90.o: $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) +.F90.o: + $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) + .f.o: $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) diff --git a/FORTRAN/README b/FORTRAN/README index 95187ab8..ce7da07d 100644 --- a/FORTRAN/README +++ b/FORTRAN/README @@ -1,4 +1,5 @@ Fortran 90 Interface + ==================== This directory contains Fortran-90 wrapper routines for SuperLU_DIST. The directory contains the following files: @@ -13,7 +14,7 @@ To compile the code, type 'make' There are two examples in the directory. -1. f_5x5.f90: +1. f_5x5.f90 A small 5x5 example appeared in the SuperLU Users Guide, Section 2.2. To run the code, type: mpiexec -n 2 f_5x5 @@ -24,11 +25,25 @@ There are two examples in the directory. 'g20.rua' in Harwell-Boeing format. To run the code, type: mpiexec -n 4 f_pddrive - (The example is set up to use 4 processors.) + (The example is set up to use 4 MPI processes) + +2. f_pddrive3d.f90: use the 3D algorithms + A real example Fortran driver routine that reads a matrix from a file + 'g20.rua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pddrive3d + (The example is set up to use 8 MPI processes) 3. f_pzdrive.f90 A complex example Fortran driver routine that reads a matrix from a file 'cg20.cua' in Harwell-Boeing format. To run the code, type: mpiexec -n 4 f_pzdrive - (The example is set up to use 4 processors.) + (The example is set up to use 4 MPI processes) + +3. 
f_pzdrive3d.f90: use the 3D algorihms + A complex example Fortran driver routine that reads a matrix from a file + 'cg20.cua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pzdrive3d + (The example is set up to use 8 MPI processes) diff --git a/FORTRAN/c2f_dcreate_matrix_x_b.c b/FORTRAN/c2f_dcreate_matrix_x_b.c new file mode 100644 index 00000000..cd9b970f --- /dev/null +++ b/FORTRAN/c2f_dcreate_matrix_x_b.c @@ -0,0 +1,281 @@ + + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ * 
+ */ +#include +#include "superlu_ddefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * C2F_DCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * fname (input) char*
+ *       File name as a character string.
+ *
+ * nrhs  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in global matrix
+ *
+ * rhs   (output) double*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) double*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * 
+ */ + +int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + double *rhs, int *ldb, double *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc, m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_dreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. 
*/ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + x[i + j*(*ldx)] = 
xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_dreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/c2f_zcreate_matrix_x_b.c b/FORTRAN/c2f_zcreate_matrix_x_b.c new file mode 100644 index 00000000..df54a458 --- /dev/null +++ b/FORTRAN/c2f_zcreate_matrix_x_b.c @@ -0,0 +1,280 @@ + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ * 
+ */ +#include +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * C2F_ZCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * fname (input) char*
+ *       File name as a character string.
+ *
+ * nrhs  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in global matrix
+ *
+ * rhs   (output) doublecomplex*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) doublecomplex*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * 
+ */ + +int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + doublecomplex *rhs, int *ldb, doublecomplex *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc, m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_zreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"cua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). 
*/ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; 
++i) + x[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_zreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/dcreate_dist_matrix.c b/FORTRAN/dcreate_dist_matrix.c index 3a6dde99..bf451cb9 100644 --- a/FORTRAN/dcreate_dist_matrix.c +++ b/FORTRAN/dcreate_dist_matrix.c @@ -118,6 +118,9 @@ int dcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz, MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } + if (iam==0) {printf("after broadcast: m %d, nnz %d\n", m,nnz); fflush(stdout);} + exit(-1); + #if 0 nzval[0]=0.1; #endif diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.F90 similarity index 85% rename from FORTRAN/f_5x5.f90 rename to FORTRAN/f_5x5.F90 index fec77adc..7585da3a 100644 --- a/FORTRAN/f_5x5.f90 +++ b/FORTRAN/f_5x5.F90 @@ -33,15 +33,19 @@ program f_5x5 ! 6. Release the process grid and terminate the MPI environment ! 7. Release all structures ! +#include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 ) +#if (XSDK_INDEX_SIZE==64) + integer*8 colind(maxnz), rowptr(maxn+1) +#else integer colind(maxnz), rowptr(maxn+1) +#endif real*8 nzval(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nrhs, ldb, nprow, npcol, init - integer*4 iam, info, i, ierr, ldb4 + integer n, m, nnz, nrhs, nprow, npcol, init + integer iam, info, i, ierr, ldb integer nnz_loc, m_loc, fst_row real*8 s, u, p, e, r, l @@ -62,9 +66,9 @@ program f_5x5 ! 
Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -81,6 +85,9 @@ program f_5x5 if ( iam == 0 ) then write(*,*) ' Process grid ', nprow, ' X ', npcol write(*,*) ' default integer size ', kind(0) +#if (XSDK_INDEX_SIZE==64) + write(*,*) ' use 64-bit integer for A matrix' +#endif endif ! !************************************************************************* @@ -166,7 +173,6 @@ program f_5x5 b(i) = 1.0 enddo nrhs = 1 - ldb4 = ldb ! Set the default input options call f_set_default_options(options) @@ -177,14 +183,14 @@ program f_5x5 ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A,nrow=m,ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0 .and. iam == 1) then @@ -196,13 +202,13 @@ program f_5x5 ! 
Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_SuperMat_Store_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_dSolveFinalize(options, SOLVEstruct) - endif + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) +! call f_LUstructFree(LUstruct) +! call get_superlu_options(options, SolveInitialized=init) +! if (init == YES) then +! call f_dSolveFinalize(options, SOLVEstruct) +! endif ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) @@ -213,7 +219,11 @@ program f_5x5 call f_destroy_ScalePerm_handle(ScalePermstruct) call f_destroy_LUstruct_handle(LUstruct) call f_destroy_SOLVEstruct_handle(SOLVEstruct) + +! call f_Destroy_CompRowLoc_Mat_dist(A) +! need to free the supermatrix Store call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) ! Check malloc diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.F90 similarity index 67% rename from FORTRAN/f_pddrive.f90 rename to FORTRAN/f_pddrive.F90 index 33803d99..c69792af 100644 --- a/FORTRAN/f_pddrive.f90 +++ b/FORTRAN/f_pddrive.F90 @@ -29,15 +29,20 @@ program f_pddrive ! 7. Release all structures ! ! +#include "superlu_dist_config.fh" use superlu_mod -! 
implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) - real*8 values(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nprow, npcol, ldb, init - integer*4 iam, info, i, ierr, ldb4, nrhs + real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname integer(superlu_ptr) :: grid integer(superlu_ptr) :: options @@ -56,9 +61,9 @@ program f_pddrive ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -76,32 +81,16 @@ program f_pddrive write(*,*) ' Process grid ', nprow, ' X ', npcol endif -! Read Harwell-Boeing matrix, and adjust the pointers and indices -! to 0-based indexing, as required by C routines. +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_dcreate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + if ( iam == 0 ) then - open(file = "../EXAMPLE/g20.rua", status = "old", unit = 5) - call dhbcode1(m, n, nnz, values, rowind, colptr) - close(unit = 5) -! - do i = 1, n+1 - colptr(i) = colptr(i) - 1 - enddo - do i = 1, nnz - rowind(i) = rowind(i) - 1 - enddo + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz endif -! Distribute the matrix to the process gird - call f_dcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid) - -! 
Setup the right hand side - call get_CompRowLoc_Matrix(A, nrow_loc=ldb) - do i = 1, ldb - b(i) = 1.0 - enddo - nrhs = 1 - ldb4 = ldb - ! Set the default input options call f_set_default_options(options) @@ -111,18 +100,20 @@ program f_pddrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0) then - write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif else write(*,*) 'INFO from f_pdgssvx = ', info endif @@ -130,13 +121,8 @@ program f_pddrive ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_dSolveFinalize(options, SOLVEstruct) - endif + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pddrive3d.F90 b/FORTRAN/f_pddrive3d.F90 new file mode 100644 index 00000000..b29eeb7d --- /dev/null +++ b/FORTRAN/f_pddrive3d.F90 @@ -0,0 +1,163 @@ + + +!> @file +! Copyright (c) 2003, The Regents of the University of California, through +! Lawrence Berkeley National Laboratory (subject to receipt of any required +! approvals from U.S. Dept. of Energy) +! +! All rights reserved. +! +! 
The source code is distributed under BSD license, see the file License.txt +! at the top-level directory. +! +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! May 12, 2021
+!! 
+! + program f_pddrive3d +! +! Purpose +! ======= +! +! The driver program F_PDDRIVE3D. +! +! This example illustrates how to use F_PDGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pdgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pddrive3d +! +#include "superlu_dist_config.fh" + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! Bail out if I do not belong in the grid. 
+ call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_dcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pdgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + + call f_dDestroy_A3d_gathered_on_2d(SOLVEstruct, grid) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! 
Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_psdrive.F90 b/FORTRAN/f_psdrive.F90 new file mode 100644 index 00000000..e7014834 --- /dev/null +++ b/FORTRAN/f_psdrive.F90 @@ -0,0 +1,146 @@ + + +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!! 
+! + program f_psdrive +! +! Purpose +! ======= +! +! The driver program F_PDDRIVE. +! +! This example illustrates how to use F_PDGSSVX with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pdgssvx +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! + #include "superlu_dist_config.fh" + use superlu_mod + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo_handle(grid) + call f_create_options_handle(options) + call f_screate_ScalePerm_handle(ScalePermstruct) + call f_screate_LUstruct_handle(LUstruct) + call f_screate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid) + +! Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam) + if ( iam >= nprow * npcol ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid ', nprow, ' X ', npcol + endif + +! 
Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_screate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_sScalePermstructInit(m, n, ScalePermstruct) + call f_sLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_psgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_sScalePermstructFree(ScalePermstruct) + call f_sDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! 
Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.F90 similarity index 67% rename from FORTRAN/f_pzdrive.f90 rename to FORTRAN/f_pzdrive.F90 index 9c9db5b0..f493f1d5 100644 --- a/FORTRAN/f_pzdrive.f90 +++ b/FORTRAN/f_pzdrive.F90 @@ -28,15 +28,21 @@ program f_pzdrive ! 7. Release all structures ! ! +#include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) - double complex values(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nprow, npcol, ldb, init - integer*4 iam, info, i, ierr, ldb4, nrhs + double complex values(maxnz), b(maxn), xtrue(maxn) + real*8 berr(maxnrhs) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname integer(superlu_ptr) :: grid integer(superlu_ptr) :: options @@ -55,9 +61,9 @@ program f_pzdrive ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -75,32 +81,16 @@ program f_pzdrive write(*,*) ' Process grid ', nprow, ' X ', npcol endif -! Read Harwell-Boeing matrix, and adjust the pointers and indices -! to 0-based indexing, as required by C routines. +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/cg20.cua'//char(0) !! 
make the string null-ended + call f_zcreate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + if ( iam == 0 ) then - open(file = "../EXAMPLE/cg20.cua", status = "old", unit = 5) - call zhbcode1(m, n, nnz, values, rowind, colptr) - close(unit = 5) -! - do i = 1, n+1 - colptr(i) = colptr(i) - 1 - enddo - do i = 1, nnz - rowind(i) = rowind(i) - 1 - enddo + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz endif -! Distribute the matrix to the process gird - call f_zcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid) - -! Setup the right hand side - call get_CompRowLoc_Matrix(A, nrow_loc=ldb) - do i = 1, ldb - b(i) = 1.0 - enddo - nrhs = 1 - ldb4 = ldb - ! Set the default input options call f_set_default_options(options) @@ -110,18 +100,20 @@ program f_pzdrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pzgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pzgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0) then - write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif else write(*,*) 'INFO from f_pdgssvx = ', info endif @@ -129,13 +121,8 @@ program f_pzdrive ! 
Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_zSolveFinalize(options, SOLVEstruct) - endif + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pzdrive3d.F90 b/FORTRAN/f_pzdrive3d.F90 new file mode 100644 index 00000000..a299d7b3 --- /dev/null +++ b/FORTRAN/f_pzdrive3d.F90 @@ -0,0 +1,163 @@ + +!> @file +! Copyright (c) 2003, The Regents of the University of California, through +! Lawrence Berkeley National Laboratory (subject to receipt of any required +! approvals from U.S. Dept. of Energy) +! +! All rights reserved. +! +! The source code is distributed under BSD license, see the file License.txt +! at the top-level directory. +! +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! May 12, 2021
+!! 
+! + program f_pzdrive3d +! +! Purpose +! ======= +! +! The driver program F_PZDRIVE3D. +! +! This example illustrates how to use F_PZGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pzgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pzdrive3d +! +#include "superlu_dist_config.fh" + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + double complex values(maxnz), b(maxn), xtrue(maxn) + real*8 berr(maxnrhs) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! 
Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/cg20.cua'//char(0) !! make the string null-ended + call f_zcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pzgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + + call f_zDestroy_A3d_gathered_on_2d(SOLVEstruct, grid) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! 
Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c index 3366671a..e62d5c91 100644 --- a/FORTRAN/sp_ienv.c +++ b/FORTRAN/sp_ienv.c @@ -53,6 +53,8 @@ at the top-level directory. of L and U, compared with A; = 7: the minimum value of the product M*N*K for a GEMM call to be off-loaded to accelerator (e.g., GPU, Xeon Phi). + = 8: the maximum buffer size on GPU that can hold the three + matrices in the GEMM call for the Schur complement update. (SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC @@ -62,13 +64,11 @@ at the top-level directory.
*/ - #include #include - -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; @@ -94,21 +94,27 @@ sp_ienv_dist(int_t ispec) return 1; case 3: - ttemp = getenv("NSUP"); + ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h if(ttemp) { - return(atoi(ttemp)); + int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE ); + return (k); } - else - return 128; + else return 128; #endif - case 6: return (5); + case 6: + ttemp = getenv("FILL"); + if ( ttemp ) return(atoi(ttemp)); + else return (5); case 7: ttemp = getenv ("N_GEMM"); if (ttemp) return atoi (ttemp); else return 10000; - + case 8: + ttemp = getenv ("MAX_BUFFER_SIZE"); + if (ttemp) return atoi (ttemp); + else return 64000000; // 8000^2 } /* Invalid value for ISPEC */ @@ -116,6 +122,5 @@ sp_ienv_dist(int_t ispec) xerr_dist("sp_ienv", &i); return 0; - } /* sp_ienv_dist */ diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c index da0e5de7..fb5122e8 100644 --- a/FORTRAN/superlu_c2f_dwrap.c +++ b/FORTRAN/superlu_c2f_dwrap.c @@ -4,287 +4,133 @@ * \brief C interface functions for the Fortran90 wrapper. * *
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
 #include "superlu_FCnames.h"
 
-/* kind of integer to hold a pointer.  Use int.
-   This might need to be changed on systems with large memory.
-   If changed, be sure to change it in superlupara.f90 too */
 
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
 #if 0
 typedef int fptr;  /* 32-bit */
 #else
 typedef long long int fptr;  /* 64-bit */
 #endif
 
-
-/* some MPI implementations may require conversion between a Fortran
-   communicator and a C communicator.  This routine is used to perform the
-   conversion.  It may need different forms for different MPI libraries. */
-
-/* NO_MPI2 should be defined on the compiler command line if the MPI
-   library does not provide MPI_Comm_f2c */
-
-MPI_Comm f2c_comm(int *f_comm)
-{
-#ifndef NO_MPI2
-
-/* MPI 2 provides a standard way of doing this */
-   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
-#else
-
-/* will probably need some special cases here */
-/* when in doubt, just return the input */
-   return (MPI_Comm)(*f_comm);
-#endif
-}
-
-
 /* functions that create memory for a struct and return a handle */
 
-void f_create_gridinfo_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
-}
-
-void f_create_options_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
-}
-
-void f_create_ScalePerm_handle(fptr *handle)
+void f_dcreate_ScalePerm_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dScalePermstruct_t));
 }
 
-void f_create_LUstruct_handle(fptr *handle)
+void f_dcreate_LUstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dLUstruct_t));
 }
 
-void f_create_SOLVEstruct_handle(fptr *handle)
+void f_dcreate_SOLVEstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dSOLVEstruct_t));
 }
 
-void f_create_SuperMatrix_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
-}
-
-void f_create_SuperLUStat_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
-}
-
-/* functions that free the memory allocated by the above functions */
-
-void f_destroy_gridinfo_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_options_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_ScalePerm_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_LUstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SOLVEstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperMatrix_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperLUStat_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-/* functions that get or set values in a C struct.
-   This is not the complete set of structs for which a user might want
-   to get/set a component, and there may be missing components. */
-
-void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
-{
-  *iam=((gridinfo_t *) *grid)->iam;
-  *npcol=((gridinfo_t *) *grid)->npcol;
-  *nprow=((gridinfo_t *) *grid)->nprow;
-}
-
-void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   *nrow = ((SuperMatrix *) *A)->nrow;
-   *ncol = ((SuperMatrix *) *A)->ncol;
-}
-
-void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   ((SuperMatrix *) *A)->nrow = *nrow;
-   ((SuperMatrix *) *A)->ncol = *ncol;
-}
-
-void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  *m=((SuperMatrix *) *A)->nrow;
-  *n=((SuperMatrix *) *A)->ncol;
-  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
-  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
-  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
-}
-
-void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  ((SuperMatrix *) *A)->nrow = *m;
-  ((SuperMatrix *) *A)->ncol = *n;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row;
-}
-
-void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
-   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
-   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
-   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
-   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
-   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
-   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
-   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
-   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
-   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
-   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
-}
-
-void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
-    l_options->Fact = (fact_t) *Fact;
-   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
-   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
-   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
-   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
-   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
-   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
-   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
-   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
-   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
-   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
-}
-
 /* wrappers for SuperLU functions */
 
-void f_set_default_options(fptr *options)
-{
-   set_default_options_dist((superlu_dist_options_t *) *options);
-}
-
-void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
-{
-  
-   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
-{
-   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridexit(fptr *grid)
-{
-   superlu_gridexit((gridinfo_t *) *grid);
-}
-
-void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+void f_dScalePermstructInit(int *m, int *n, fptr *ScalePermstruct)
 {
    dScalePermstructInit(*m, *n, (dScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_ScalePermstructFree(fptr *ScalePermstruct)
+void f_dScalePermstructFree(fptr *ScalePermstruct)
 {
    dScalePermstructFree((dScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_PStatInit(fptr *stat)
-{
-   PStatInit((SuperLUStat_t *) *stat);
-}
-
-void f_PStatFree(fptr *stat)
-{
-   PStatFree((SuperLUStat_t *) *stat);
-}
-
-void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+void f_dLUstructInit(int *m, int *n, fptr *LUstruct)
 {
    extern void dLUstructInit(const int_t, dLUstruct_t *);
 
    dLUstructInit(*m, (dLUstruct_t *) *LUstruct);
 }
 
-void f_LUstructFree(fptr *LUstruct)
+void f_dLUstructFree(fptr *LUstruct)
 {
    extern void dLUstructFree(dLUstruct_t *);
 
    dLUstructFree((dLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_dDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    dLUstructFree((dLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+    }
 }
 
-void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-				   int_t *m_loc, int_t *fst_row, double *nzval,
-				   int_t *colind, int_t *rowptr, int *stype,
-				   int *dtype, int *mtype)
+void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
 {
-   dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
-                                  *fst_row, (double *) nzval, colind, rowptr,
-                                  (Stype_t) *stype, (Dtype_t) *dtype,
-                                  (Mtype_t) *mtype);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dLUstruct_t *LUstruct_ptr = (dLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	dDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+    	dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+    } else { // process layers other than 0
+        dDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        dDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    dLUstructFree(LUstruct_ptr);
 }
 
-void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+void f_dDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
-   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+    dDestroy_A3d_gathered_on_2d((dSOLVEstruct_t *) *SOLVEstruct,
+                                      (gridinfo3d_t *) *grid3d);
 }
 
-void f_Destroy_SuperMat_Store_dist(fptr *A)
+
+void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
+				   int *m_loc, int *fst_row, double *nzval,
+				   int_t *colind, int_t *rowptr, int *stype,
+				   int *dtype, int *mtype)
 {
-   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+#if 1
+    double *C_nzval = nzval;
+    int_t *C_colind = colind;
+    int_t *C_rowptr = rowptr;
+#else
+    /* make a copy of matrix A that is internal to the C side */
+    double *C_nzval = doubleMalloc_dist(*nnz_loc);
+    int_t *C_colind = intMalloc_dist(*nnz_loc);
+    int_t *C_rowptr = intMalloc_dist(*m_loc + 1);
+    int i;
+    
+    for (i = 0; i < *nnz_loc; ++i) {
+        C_nzval[i] = nzval[i];
+        C_colind[i] = colind[i];
+    }
+    for (i = 0; i <= *m_loc; ++i) {
+        C_rowptr[i] = rowptr[i];
+    }
+#endif
+
+    dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
+                                  *fst_row, C_nzval, C_colind, C_rowptr,
+                                  (Stype_t) *stype, (Dtype_t) *dtype,
+                                  (Mtype_t) *mtype);
 }
 
 void f_dSolveFinalize(fptr *options, fptr *SOLVEstruct)
@@ -307,26 +153,64 @@ void f_pdgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, double *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_dcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   double *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pdgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 double *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int dcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, double *,
-			   int_t * , int_t *, gridinfo_t *);
-
-   dcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (double *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pdgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (dScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (dLUstruct_t *) *LUstruct,
+	      (dSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
 
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
 }
 
-/* Check malloc */
+/* Create the distributed matrix */
 
-void f_check_malloc(int *iam)
-{
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC((int_t) *iam, "Check Malloc");
-#endif
+void f_dcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+    extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol);
+
+    int iam, nprocs;
+    int nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_dcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+    extern void f_get_gridinfo3d(fptr *grid, int *iam,
+                                 int *nprow, int *npcol, int *npdep);
+
+    int iam, nprocs;
+    int nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
+
diff --git a/FORTRAN/superlu_c2f_wrap.c b/FORTRAN/superlu_c2f_wrap.c
new file mode 100644
index 00000000..1c4b0e8e
--- /dev/null
+++ b/FORTRAN/superlu_c2f_wrap.c
@@ -0,0 +1,262 @@
+
+
+/*! @file 
+ * \brief C interface functions for the Fortran90 wrapper.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ * April 5, 2015
+ * May 12, 2021
+ */
+
+#include "superlu_defs.h"
+#include "superlu_FCnames.h"
+
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
+#if 0
+typedef int fptr;  /* 32-bit */
+#else
+typedef long long int fptr;  /* 64-bit */
+#endif
+
+
+/* some MPI implementations may require conversion between a Fortran
+   communicator and a C communicator.  This routine is used to perform the
+   conversion.  It may need different forms for different MPI libraries. */
+
+/* NO_MPI2 should be defined on the compiler command line if the MPI
+   library does not provide MPI_Comm_f2c */
+
+MPI_Comm f2c_comm(int *f_comm)
+{
+#ifndef NO_MPI2
+
+/* MPI 2 provides a standard way of doing this */
+   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
+#else
+
+/* will probably need some special cases here */
+/* when in doubt, just return the input */
+   return (MPI_Comm)(*f_comm);
+#endif
+}
+
+
+/* functions that create memory for a struct and return a handle */
+
+void f_create_gridinfo_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
+}
+
+void f_create_gridinfo3d_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t));
+}
+
+void f_create_options_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
+}
+
+void f_create_SuperMatrix_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
+}
+
+void f_create_SuperLUStat_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
+}
+
+/* functions that free the memory allocated by the above functions */
+
+void f_destroy_gridinfo_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_options_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_ScalePerm_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_LUstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SOLVEstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperMatrix_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperLUStat_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+/* functions that get or set values in a C struct.
+   This is not the complete set of structs for which a user might want
+   to get/set a component, and there may be missing components. */
+
+void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol)
+{
+  *iam=((gridinfo_t *) *grid)->iam;
+  *npcol=((gridinfo_t *) *grid)->npcol;
+  *nprow=((gridinfo_t *) *grid)->nprow;
+}
+
+void f_get_gridinfo3d(fptr *grid, int *iam,
+         	      int *nprow, int *npcol, int *npdep)
+{
+  *iam=((gridinfo3d_t *) *grid)->iam;
+  *npcol=((gridinfo3d_t *) *grid)->npcol;
+  *nprow=((gridinfo3d_t *) *grid)->nprow;
+  *npdep=((gridinfo3d_t *) *grid)->npdep;
+}
+
+void f_get_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   *nrow = ((SuperMatrix *) *A)->nrow;
+   *ncol = ((SuperMatrix *) *A)->ncol;
+}
+
+void f_set_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   ((SuperMatrix *) *A)->nrow = *nrow;
+   ((SuperMatrix *) *A)->ncol = *ncol;
+}
+
+void f_get_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  *m=((SuperMatrix *) *A)->nrow;
+  *n=((SuperMatrix *) *A)->ncol;
+  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
+  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
+  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
+}
+
+void f_set_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  NRformat_loc *Astore = ((SuperMatrix *) *A)->Store;
+
+  ((SuperMatrix *) *A)->nrow = *m;
+  ((SuperMatrix *) *A)->ncol = *n;
+  Astore->m_loc = *m_loc;
+  Astore->nnz_loc = *nnz_loc;
+  Astore->fst_row = *fst_row;
+}
+
+void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
+   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
+   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
+   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
+   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
+   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
+   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
+   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
+   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
+   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
+   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
+}
+
+void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
+    l_options->Fact = (fact_t) *Fact;
+   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
+   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
+   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
+   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
+   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
+   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
+   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
+   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
+   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
+   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
+}
+
+/* wrappers for SuperLU functions */
+
+void f_set_default_options(fptr *options)
+{
+   set_default_options_dist((superlu_dist_options_t *) *options);
+}
+
+void f_superlu_gridinit(int *Bcomm, int *nprow, int *npcol, fptr *grid)
+{
+   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridinit3d(int *Bcomm, int *nprow, int *npcol,
+   			  int *npdep, fptr *grid)
+{
+    superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid);
+}
+
+void f_superlu_gridmap(int *Bcomm, int *nprow, int *npcol, 
+                       int *usermap, int *ldumap, fptr *grid)
+{
+   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap,
+		   (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridexit(fptr *grid)
+{
+   superlu_gridexit((gridinfo_t *) *grid);
+}
+
+void f_PStatInit(fptr *stat)
+{
+   PStatInit((SuperLUStat_t *) *stat);
+}
+
+void f_PStatFree(fptr *stat)
+{
+   PStatFree((SuperLUStat_t *) *stat);
+}
+
+void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+{
+   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+}
+
+void f_Destroy_SuperMat_Store_dist(fptr *A)
+{
+   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+}
+
+/* Check malloc */
+
+void f_check_malloc(int *iam)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(*iam, "Check Malloc");
+#endif
+}
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
index ee963113..70f66607 100644
--- a/FORTRAN/superlu_c2f_zwrap.c
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -3,287 +3,133 @@
  * \brief C interface functions for the Fortran90 wrapper.
  *
  * 
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
 #include "superlu_FCnames.h"
 
-/* kind of integer to hold a pointer.  Use int.
-   This might need to be changed on systems with large memory.
-   If changed, be sure to change it in superlupara.f90 too */
 
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
 #if 0
 typedef int fptr;  /* 32-bit */
 #else
 typedef long long int fptr;  /* 64-bit */
 #endif
 
-
-/* some MPI implementations may require conversion between a Fortran
-   communicator and a C communicator.  This routine is used to perform the
-   conversion.  It may need different forms for different MPI libraries. */
-
-/* NO_MPI2 should be defined on the compiler command line if the MPI
-   library does not provide MPI_Comm_f2c */
-
-MPI_Comm f2c_comm(int *f_comm)
-{
-#ifndef NO_MPI2
-
-/* MPI 2 provides a standard way of doing this */
-   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
-#else
-
-/* will probably need some special cases here */
-/* when in doubt, just return the input */
-   return (MPI_Comm)(*f_comm);
-#endif
-}
-
-
 /* functions that create memory for a struct and return a handle */
 
-void f_create_gridinfo_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
-}
-
-void f_create_options_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
-}
-
-void f_create_ScalePerm_handle(fptr *handle)
+void f_zcreate_ScalePerm_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zScalePermstruct_t));
 }
 
-void f_create_LUstruct_handle(fptr *handle)
+void f_zcreate_LUstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zLUstruct_t));
 }
 
-void f_create_SOLVEstruct_handle(fptr *handle)
+void f_zcreate_SOLVEstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zSOLVEstruct_t));
 }
 
-void f_create_SuperMatrix_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
-}
-
-void f_create_SuperLUStat_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
-}
-
-/* functions that free the memory allocated by the above functions */
-
-void f_destroy_gridinfo_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_options_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_ScalePerm_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_LUstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SOLVEstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperMatrix_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperLUStat_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-/* functions that get or set values in a C struct.
-   This is not the complete set of structs for which a user might want
-   to get/set a component, and there may be missing components. */
-
-void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
-{
-  *iam=((gridinfo_t *) *grid)->iam;
-  *npcol=((gridinfo_t *) *grid)->npcol;
-  *nprow=((gridinfo_t *) *grid)->nprow;
-}
-
-void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   *nrow = ((SuperMatrix *) *A)->nrow;
-   *ncol = ((SuperMatrix *) *A)->ncol;
-}
-
-void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   ((SuperMatrix *) *A)->nrow = *nrow;
-   ((SuperMatrix *) *A)->ncol = *ncol;
-}
-
-void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  *m=((SuperMatrix *) *A)->nrow;
-  *n=((SuperMatrix *) *A)->ncol;
-  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
-  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
-  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
-}
-
-void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  ((SuperMatrix *) *A)->nrow = *m;
-  ((SuperMatrix *) *A)->ncol = *n;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row;
-}
-
-void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
-   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
-   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
-   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
-   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
-   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
-   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
-   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
-   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
-   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
-   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
-}
-
-void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
-    l_options->Fact = (fact_t) *Fact;
-   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
-   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
-   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
-   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
-   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
-   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
-   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
-   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
-   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
-   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
-}
-
 /* wrappers for SuperLU functions */
 
-void f_set_default_options(fptr *options)
-{
-   set_default_options_dist((superlu_dist_options_t *) *options);
-}
-
-void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
-{
-  
-   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
-{
-   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridexit(fptr *grid)
-{
-   superlu_gridexit((gridinfo_t *) *grid);
-}
-
-void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+void f_zScalePermstructInit(int *m, int *n, fptr *ScalePermstruct)
 {
    zScalePermstructInit(*m, *n, (zScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_ScalePermstructFree(fptr *ScalePermstruct)
+void f_zScalePermstructFree(fptr *ScalePermstruct)
 {
    zScalePermstructFree((zScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_PStatInit(fptr *stat)
-{
-   PStatInit((SuperLUStat_t *) *stat);
-}
-
-void f_PStatFree(fptr *stat)
-{
-   PStatFree((SuperLUStat_t *) *stat);
-}
-
-void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+void f_zLUstructInit(int *m, int *n, fptr *LUstruct)
 {
    extern void zLUstructInit(const int_t, zLUstruct_t *);
 
    zLUstructInit(*m, (zLUstruct_t *) *LUstruct);
 }
 
-void f_LUstructFree(fptr *LUstruct)
+void f_zLUstructFree(fptr *LUstruct)
 {
    extern void zLUstructFree(zLUstruct_t *);
 
    zLUstructFree((zLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_zDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    zLUstructFree((zLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+    }
 }
 
-void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-				   int_t *m_loc, int_t *fst_row, doublecomplex *nzval,
-				   int_t *colind, int_t *rowptr, int *stype,
-				   int *dtype, int *mtype)
+void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
 {
-   zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
-                                  *fst_row, (doublecomplex *) nzval, colind, rowptr,
-                                  (Stype_t) *stype, (Dtype_t) *dtype,
-                                  (Mtype_t) *mtype);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zLUstruct_t *LUstruct_ptr = (zLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	zDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+    	zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+    } else { // process layers other than 0
+        zDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        zDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    zLUstructFree(LUstruct_ptr);
 }
 
-void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+void f_zDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
-   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+    zDestroy_A3d_gathered_on_2d((zSOLVEstruct_t *) *SOLVEstruct,
+                                      (gridinfo3d_t *) *grid3d);
 }
 
-void f_Destroy_SuperMat_Store_dist(fptr *A)
+
+void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
+				   int *m_loc, int *fst_row, doublecomplex *nzval,
+				   int_t *colind, int_t *rowptr, int *stype,
+				   int *dtype, int *mtype)
 {
-   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+#if 1
+    doublecomplex *C_nzval = nzval;
+    int_t *C_colind = colind;
+    int_t *C_rowptr = rowptr;
+#else
+    /* make a copy of matrix A that is internal to the C side */
+    doublecomplex *C_nzval = doublecomplexMalloc_dist(*nnz_loc);
+    int_t *C_colind = intMalloc_dist(*nnz_loc);
+    int_t *C_rowptr = intMalloc_dist(*m_loc + 1);
+    int i;
+    
+    for (i = 0; i < *nnz_loc; ++i) {
+        C_nzval[i] = nzval[i];
+        C_colind[i] = colind[i];
+    }
+    for (i = 0; i <= *m_loc; ++i) {
+        C_rowptr[i] = rowptr[i];
+    }
+#endif
+
+    zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
+                                  *fst_row, C_nzval, C_colind, C_rowptr,
+                                  (Stype_t) *stype, (Dtype_t) *dtype,
+                                  (Mtype_t) *mtype);
 }
 
 void f_zSolveFinalize(fptr *options, fptr *SOLVEstruct)
@@ -306,26 +152,64 @@ void f_pzgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, doublecomplex *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_zcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   doublecomplex *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pzgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 doublecomplex *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int zcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
-			   int_t * , int_t *, gridinfo_t *);
-
-   zcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (doublecomplex *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pzgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (zScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (zLUstruct_t *) *LUstruct,
+	      (zSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
 
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
 }
 
-/* Check malloc */
+/* Create the distributed matrix */
 
-void f_check_malloc(int *iam)
-{
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC((int_t) *iam, "Check Malloc");
-#endif
+void f_zcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+    extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol);
+
+    int iam, nprocs;
+    int nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_zcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+    extern void f_get_gridinfo3d(fptr *grid, int *iam,
+                                 int *nprow, int *npcol, int *npdep);
+
+    int iam, nprocs;
+    int nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
+
diff --git a/FORTRAN/superlu_mod.f90 b/FORTRAN/superlu_mod.f90
index bdfa8191..4a70a8f4 100644
--- a/FORTRAN/superlu_mod.f90
+++ b/FORTRAN/superlu_mod.f90
@@ -1,7 +1,12 @@
 !> @file
 !! \brief This module contains Fortran-side wrappers for the SuperLU
 !! get/set functions.
-!
+!!
+!! 
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
module superlu_mod @@ -16,13 +21,18 @@ module superlu_mod implicit none contains -subroutine get_GridInfo(grid, iam, nprow, npcol) - integer(superlu_ptr) :: grid +subroutine get_GridInfo(grid, iam, nprow, npcol, npdep) + integer(superlu_ptr) :: grid !! can be 2D or 3D grid integer*4, optional :: iam - integer, optional :: nprow, npcol - integer :: l_iam, l_nprow, l_npcol - - call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol) + integer, optional :: nprow, npcol, npdep + integer :: l_iam, l_nprow, l_npcol, l_npdep + + if (present(npdep)) then + call f_get_gridinfo3d(grid, l_iam, l_nprow, l_npcol, l_npdep) + npdep = l_npdep + else + call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol) + endif if (present(iam)) iam = l_iam if (present(nprow)) nprow = l_nprow diff --git a/FORTRAN/superlupara.f90 b/FORTRAN/superlupara.f90 index d246ae88..7ae58251 100644 --- a/FORTRAN/superlupara.f90 +++ b/FORTRAN/superlupara.f90 @@ -2,6 +2,11 @@ !! \brief This module contains some parameter used in SuperLU for !! Fortran90 user. ! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
module superlupara_mod @@ -14,7 +19,7 @@ module superlupara_mod public superlu_ptr !---------------------------------------------------- -! kind of integer to hold a SuperLU pointer. Use default integer. +! kind of integer to hold a SuperLU pointer. Use 64-bit integer. ! This might need to be changed on systems with large memory. ! If changed, be sure to change it in superlu_c2f_wrap.c too. ! @@ -24,8 +29,8 @@ module superlupara_mod !---------------------------------------------------- ! The following parameters are defined: -! These values come from superlu_defs.h. If the values in there change with -! the version of SuperLU, then they need to be changed here, too. +! These values come from superlu_enum_consts.h. If the values in there +! change, then they need to be changed here, too. integer, parameter, public :: & NO = 0, & ! yes_no_t @@ -35,8 +40,9 @@ module superlupara_mod SamePattern_SameRowPerm = 2, & FACTORED = 3, & NOROWPERM = 0, & ! rowperm_t - LargeDiag = 1, & - MY_PERMR = 2, & + LargeDiag_MC64 = 1, & + LargeDiag_HWPM = 2, & + MY_PERMR = 3, & NATURAL = 0, & ! colperm_t MMD_ATA = 1, & MMD_AT_PLUS_A = 2, & @@ -53,15 +59,26 @@ module superlupara_mod COL = 2, & BOTH = 3, & NOREFINE = 0, & ! IterRefine_t - SINGLE = 1, & - DOUBLE = 2, & - EXTRA = 3, & - LUSUP = 0, & ! MemType Need? - UCOL = 1, & - LSUB = 2, & - USUB = 3, & - SYSTEM = 0, & ! LU_space_t Need? - USER = 1 + SLU_SINGLE = 1, & + SLU_DOUBLE = 2, & + SLU_EXTRA = 3, & + USUB = 0, & ! MemType + LSUB = 1, & + UCOL = 2, & + LUSUP = 3, & + LLVL = 4, & + ULVL = 5, & + NO_MEMTYPE = 6, & + SYSTEM = 0, & ! LU_space_t + USER = 1, & + SILU = 0, & ! milu_t + SMILU_1 = 1, & + SMILU_2 = 2, & + SMILU_3 = 3 + +! These values come from supermatrix.h. If the values in there +! change, then they need to be changed here, too. + integer, parameter, public :: & SLU_NC = 0, & ! 
Stype_t SLU_NCP = 1, & @@ -85,7 +102,6 @@ module superlupara_mod SLU_HEL = 7, & SLU_HEU = 8 - !---------------------------------------------------- end module superlupara_mod diff --git a/README.md b/README.md index 2b3a7de6..cbafbbf9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SuperLU_DIST (version 6.4) superlu +# SuperLU_DIST (version 7.1.1) superlu [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) [Nightly tests](http://my.cdash.org/index.php?project=superlu_dist) @@ -11,15 +11,69 @@ to run accurately and efficiently on large numbers of processors. SuperLU_DIST is a parallel extension to the serial SuperLU library. It is targeted for the distributed memory parallel machines. -SuperLU_DIST is implemented in ANSI C, and MPI for communications. -Currently, the LU factorization and triangular solution routines, -which are the most time-consuming part of the solution process, -are parallelized. The other routines, such as static pivoting and -column preordering for sparsity are performed sequentially. -This "alpha" release contains double-precision real and double-precision -complex data types. +SuperLU_DIST is implemented in ANSI C, with OpenMP for on-node parallelism +and MPI for off-node communications. We are actively developing GPU +acceleration capabilities. 
+ + + + + + + +Table of Contents +================= + +* [SuperLU_DIST (version 7.1.1) superlu](#superlu_dist-version-70---) +* [Directory structure of the source code](#directory-structure-of-the-source-code) +* [Installation](#installation) + * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system) + * [Dependent external libraries: BLAS and ParMETIS](#dependent-external-libraries-blas-and-parmetis) + * [Optional external libraries: CombBLAS, LAPACK](#optional-external-libraries-combblas-lapack) + * [Use GPU](#use-gpu) + * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions) + * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile) + * [2.1 Edit the make.inc include file.](#21-edit-the-makeinc-include-file) + * [2.2. The BLAS library.](#22-the-blas-library) + * [2.3. External libraries.](#23-external-libraries) + * [2.3.1 Metis and ParMetis.](#231-metis-and-parmetis) + * [2.3.2 LAPACK.](#232-lapack) + * [2.3.3 CombBLAS.](#233-combblas) + * [2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#24-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface) + * [2.5. Multicore and GPU.](#25-multicore-and-gpu) +* [Summary of the environment variables.](#summary-of-the-environment-variables) +* [Windows Usage](#windows-usage) +* [Reading sparse matrix files](#reading-sparse-matrix-files) +* [REFERENCES](#references) +* [RELEASE VERSIONS](#release-versions) + +Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc) + +# SuperLU_DIST (version 7.0) superlu + +[![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) +[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist) + +SuperLU_DIST contains a set of subroutines to solve a sparse linear system +A*X=B. 
It uses Gaussian elimination with static pivoting (GESP). +Static pivoting is a technique that combines the numerical stability of +partial pivoting with the scalability of Cholesky (no pivoting), +to run accurately and efficiently on large numbers of processors. + +SuperLU_DIST is a parallel extension to the serial SuperLU library. +It is targeted for the distributed memory parallel machines. +SuperLU_DIST is implemented in ANSI C, with OpenMP for on-node parallelism +and MPI for off-node communications. We are actively developing GPU +acceleration capabilities. + + + + + + -### The distribution contains the following directory structure: + +# Directory structure of the source code ``` SuperLU_DIST/README instructions on installation @@ -28,6 +82,7 @@ SuperLU_DIST/CBLAS/ needed BLAS routines in C, not necessarily fast library with multiple OpenMP threads, performance relies on a good multithreaded BLAS implementation.) SuperLU_DIST/DOC/ the Users' Guide +SuperLU_DIST/FORTRAN/ Fortran90 wrapper functions SuperLU_DIST/EXAMPLE/ example programs SuperLU_DIST/INSTALL/ test machine dependent parameters SuperLU_DIST/SRC/ C source code, to be compiled into libsuperlu_dist.a @@ -41,35 +96,71 @@ SuperLU_DIST/make.inc compiler, compiler flags, library definitions and C SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files ``` -## INSTALLATION +# Installation -There are two ways to install the package. One requires users to -edit makefile manually, the other uses CMake automatic build system. +There are two ways to install the package. The first method is to use +CMake automatic build system. The other method requires users to The procedures are described below. -### Installation option 1: Using CMake build system. +## Installation option 1: Using CMake build system. You will need to create a build tree from which to invoke CMake. 
-First, in order to use parallel symbolic factorization function, you +### Dependent external libraries: BLAS and ParMETIS +If you have a BLAS library on your machine, you can link with it +with the following cmake definition: +``` +-DTPL_BLAS_LIBRARIES="" +``` +Otherwise, the CBLAS/ subdirectory contains the part of the C BLAS +(single threaded) needed by SuperLU_DIST, but they are not optimized. +You can compile and use it with the following cmake definition: +``` +-DTPL_ENABLE_INTERNAL_BLASLIB=ON +``` + +The default sparsity ordering is METIS. But, in order to use parallel +symbolic factorization function, you need to install ParMETIS parallel ordering package and define the two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR +(Note: ParMETIS library also contains serial METIS library.) + ``` export PARMETIS_ROOT= export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 ``` -Second, in order to use parallel weighted matching AWPM for numerical -pre-pivoting, you need to install CombBLAS and define the environment -variable: +### Optional external libraries: CombBLAS, LAPACK + +In order to use parallel weighted matching HWPM (Heavy Weight +Perfect Matching) for numerical pre-pivoting, you need to install +CombBLAS and define the environment variable: ``` export COMBBLAS_ROOT= export COMBBLAS_BUILD_DIR=${COMBBLAS_ROOT}/_build ``` +Then, install with cmake option: +``` +-DTPL_ENABLE_COMBBLASLIB=ON +``` -Once these needed third-party libraries are in place, SuperLU installation -can be done as follows from the top level directory: +By default, LAPACK is not needed. Only in triangular solve routine, we +may use LAPACK to explicitly invert the dense diagonal block to improve +speed. 
You can use it with the following cmake option: +``` +-DTPL_ENABLE_LAPACKLIB=ON +``` + +### Use GPU +You can enable GPU with CUDA with the following cmake option: +``` +-DTPL_ENABLE_CUDALIB=TRUE +-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so" +``` + +Once these needed third-party libraries are in place, the installation +can be done as follows at the top level directory: For a simple installation with default setting, do: (ParMETIS is needed, i.e., TPL_ENABLE_PARMETISLIB=ON) @@ -79,8 +170,7 @@ cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \ ``` - -For a more sophisticated installation including third-part libraries, do: +For a more sophisticated installation including third-party libraries, do: ``` cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ @@ -99,17 +189,11 @@ OOT}/Applications/BipartiteMatchings" \ ( see example cmake script: run_cmake_build.sh ) ``` -You can enable GPU with CUDA with the following cmake option: -``` -`-DTPL_ENABLE_CUDALIB=TRUE` -`-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"` -``` -You can disable LAPACK, ParMetis or CombBLAS with the following cmake option: +You can disable CombBLAS or LAPACK with the following cmake options: ``` -`-DTPL_ENABLE_LAPACKLIB=FALSE` -`-DTPL_ENABLE_PARMETISLIB=FALSE` -`-DTPL_ENABLE_COMBBLASLIB=FALSE` +-DTPL_ENABLE_LAPACKLIB=FALSE +-DTPL_ENABLE_COMBBLASLIB=FALSE ``` To actually build (compile), type: @@ -133,25 +217,38 @@ execution may fail. You can pass the definition option "-DMPIEXEC_EXECUTABLE" to cmake. For example on Cori at NERSC, you will need the following: `-DMPIEXEC_EXECUTABLE=/usr/bin/srun` -Or, you can always go to TEST/ directory to perform -testing manually. 
- -**Note on the C-Fortran name mangling handled by C preprocessor definition:** -In the default setting, we assume that Fortran expects a C routine -to have an underscore postfixed to the name. Depending on the -compiler, you may need to define one of the following flags in -during the cmake build to overwrite default setting: -``` -cmake .. -DCMAKE_C_FLAGS="-DNoChange" -cmake .. -DCMAKE_C_FLAGS="-DUpCase" -``` - - -### Installation option 2: Manual installation with makefile. +Or, you can always go to TEST/ directory to perform testing manually. + +### Summary of the CMake definitions. +The following list summarize the commonly used CMake definitions. In each case, +the first choice is the default setting. After running 'cmake' installation, +a configuration header file is generated in SRC/superlu_dist_config.h, which +contains the key CPP definitions used throughout the code. +``` + -TPL_ENABLE_PARMETISLIB=ON | OFF + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON + -DTPL_ENABLE_LAPACKLIB=OFF | ON + -TPL_ENABLE_COMBBLASLIB=OFF + -DTPL_ENABLE_CUDALIB=OFF | ON + -Denable_complex16=OFF | ON + -DXSDK_INDEX_SIZE=32 | 64 + + -DBUILD_SHARED_LIBS= OFF | ON + -DCMAKE_INSTALL_PREFIX=<...>. + -DCMAKE_C_COMPILER= + -DCMAKE_C_FLAGS="..." + -DCMAKE_CXX_COMPILER= + -DMAKE_CXX_FLAGS="..." + -DCMAKE_CUDA_FLAGS="..." + -DXSDK_ENABLE_Fortran=OFF | ON + -DCMAKE_Fortran_COMPILER= +``` + +## Installation option 2: Manual installation with makefile. Before installing the package, please examine the three things dependent on your system setup: -#### 1.1 Edit the make.inc include file. +### 2.1 Edit the make.inc include file. This make include file is referenced inside each of the Makefiles in the various subdirectories. As a result, there is no need to @@ -183,7 +280,7 @@ printing level to show solver's execution details. (default 0) diagnostic printing level for debugging purpose. (default 0) ``` -#### 1.2. The BLAS library. +### 2.2. The BLAS library. 
The parallel routines in SuperLU_DIST use some BLAS routines on each MPI process. Moreover, if you enable OpenMP with multiple threads, you need to @@ -210,22 +307,9 @@ top-level SuperLU_DIST/ directory and do the following: to make the BLAS library from the routines in the ` CBLAS/ subdirectory.` -#### 1.3. External libraries. - - ##### 1.3.1 LAPACK. - Starting Version 6.0, the triangular solve routine can perform explicit - inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine. - To use this feature, you should define the following in make.inc: -``` -SLU_HAVE_LAPACK = TRUE -LAPACKLIB = -``` -You can disable LAPACK with the following line in SRC/superlu_dist_config.h: -``` -#undef SLU_HAVE_LAPACK -``` +### 2.3. External libraries. - ##### 1.3.2 Metis and ParMetis. + #### 2.3.1 Metis and ParMetis. If you will use Metis or ParMetis for sparsity ordering, you will need to install them yourself. Since ParMetis package already @@ -243,9 +327,21 @@ I_PARMETIS = -I/include -I/metis/include You can disable ParMetis with the following line in SRC/superlu_dist_config.h: ``` #undef HAVE_PARMETIS +``` + #### 2.3.2 LAPACK. + Starting Version 6.0, the triangular solve routine can perform explicit + inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine. + To use this feature, you should define the following in make.inc: +``` +SLU_HAVE_LAPACK = TRUE +LAPACKLIB = +``` +You can disable LAPACK with the following line in SRC/superlu_dist_config.h: +``` +#undef SLU_HAVE_LAPACK ``` - ##### 1.3.3 CombBLAS. + #### 2.3.3 CombBLAS. You can use parallel approximate weight perfect matching (AWPM) algorithm to perform numerical pre-pivoting for stability. The default pre-pivoting @@ -265,10 +361,9 @@ You can disable CombBLAS with the following line in SRC/superlu_dist_config.h: #undef HAVE_COMBBLAS ``` +### 2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.) -#### 1.4. C preprocessor definition CDEFS. 
(Replaced by cmake module FortranCInterface.) - -In the header file SRC/Cnames.h, we use macros to determine how +In the header file SRC/superlu_Cnames.h, we use macros to determine how C routines should be named so that they are callable by Fortran. (Some vendor-supplied BLAS libraries do not have C interfaces. So the re-naming is needed in order for the SuperLU BLAS calls (in C) to @@ -283,7 +378,7 @@ The possible options for CDEFS are: -DUpCase: Fortran expects a C routine name to be all uppercase. ``` -#### 1.5. Multicore and GPU (optional). +### 2.5. Multicore and GPU. To use OpenMP parallelism, need to link with an OpenMP library, and set the number of threads you wish to use as follows (bash): @@ -302,8 +397,25 @@ A Makefile is provided in each subdirectory. The installation can be done completely automatically by simply typing "make" at the top level. +# Summary of the environment variables. +A couple of environment variables affect parallel execution. +``` + export OMP_NUM_THREADS=<...> + export SUPERLU_ACC_OFFLOAD=1 // this enables use of GPU. Default is 0. +``` +Several integer blocking parameters may affect performance. Most of them can be +set by the user through environment variables. Oherwise the default values +are provided. Various SuperLU routines call an environment inquiry function +to obtain these parameters. This function is provided in the file SRC/sp_ienv.c. +Please consult that file for detailed description of the meanings. +``` + export NREL=<...> // supernode relaxation parameter + export NSUP=<...> // maximum allowable supernode size, not to exceed 512 + export FILL=<...> // estimated fill ratio of nonzeros(L+U)/nonzeros(A) + export MAX_BUFFER_SIZE=<...> // maximum buffer size on GPU for GEMM +``` -## Windows Usage +# Windows Usage Prerequisites: CMake, Visual Studio, Microsoft HPC Pack This has been tested with Visual Studio 2017, without Parmetis, without Fortran, and with OpenMP disabled. 
@@ -337,7 +449,7 @@ for the above configuration. If you wish to test: `ctest` -## READING SPARSE MATRIX FILES +# Reading sparse matrix files The SRC/ directory contains the following routines to read different file formats, they all have the similar calling sequence. @@ -350,7 +462,7 @@ dreadtriple.c : triplet, with header dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab ``` -## REFERENCES +# REFERENCES **[1]** X.S. Li and J.W. Demmel, "SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for Unsymmetric Linear Systems", ACM Trans. on Math. @@ -363,24 +475,34 @@ dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab Porto, Portugal. **[4]** P. Sao, X.S. Li, R. Vuduc, “A Communication-Avoiding 3D Factorization for Sparse Matrices”, Proc. of IPDPS, May 21–25, 2018, Vancouver. -**[5]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable +**[5]** P. Sao, R. Vuduc, X. Li, "Communication-avoiding 3D algorithm for + sparse LU factorization on heterogeneous systems", J. Parallel and + Distributed Computing (JPDC), September 2019. +**[6]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable distributed-memory sparse triangular solution algorithms”, Proc. of SIAM workshop on Combinatorial Scientific Computing, June 6-8, 2018, - Bergen, Norway. + Bergen, Norway. +**[7]** N. Ding, S. Williams, Y. Liu, X.S. Li, "Leveraging One-Sided + Communication for Sparse Triangular Solvers", Proc. of SIAM Conf. on + Parallel Processing for Scientific Computing. Feb. 12-15, 2020. +**[8]** A. Azad, A. Buluc, X.S. Li, X. Wang, and J. Langguth, +"A distributed-memory algorithm for computing a heavy-weight perfect matching +on bipartite graphs", SIAM J. Sci. Comput., Vol. 42, No. 4, pp. C143-C168, 2020. -**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov) + +**Xiaoye S. 
Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov) **Gustavo Chavez**, Lawrence Berkeley National Lab, [gichavez@lbl.gov](gichavez@lbl.gov) +**Nan Ding**, Lawrence Berkeley National Lab, [nanding@lbl.gov](nanding@lbl.gov) **Laura Grigori**, INRIA, France, [laura.grigori@inria.fr](laura.grigori@inria.fr) **Yang Liu**, Lawrence Berkeley National Lab, [liuyangzhuan@lbl.gov](liuyangzhuan@lbl.gov) -**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) **Piyush Sao**, Georgia Institute of Technology, [piyush.feynman@gmail.com](piyush.feynman@gmail.com) +**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) **Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki@gmail.com](ic.yamazaki@gmail.com) -**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu) +**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu) **John Gilbert**, UC Santa Barbara, [gilbert@cs.ucsb.edu](gilbert@cs.ucsb.edu) - -## RELEASE VERSIONS +# RELEASE VERSIONS ``` October 15, 2003   Version 2.0 October 1, 2007   Version 2.1 @@ -406,4 +528,5 @@ February 8, 2019 Version 6.1.1 November 12, 2019 Version 6.2.0 February 23, 2020 Version 6.3.0 October 23, 2020 Version 6.4.0 +May 10, 2021 Version 7.0.0 ``` diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index c2eeb9fe..00ebb8b8 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -42,16 +42,26 @@ set(sources colamd.c superlu_dist_version.c comm_tree.c + superlu_grid3d.c ## 3D code + supernodal_etree.c + supernodalForest.c + trfAux.c + communication_aux.c + treeFactorization.c + sec_structs.c ) +if (HAVE_CUDA) + list(APPEND sources superlu_gpu_utils.cu) +endif() + if (MSVC) list(APPEND sources wingetopt.c) endif () - set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0) if(enable_double) - list(APPEND headers superlu_ddefs.h) + list(APPEND headers superlu_ddefs.h dlustruct_gpu.h) if(TPL_ENABLE_CUDALIB) list(APPEND 
sources pdgstrs_lsum_cuda.cu) @@ -97,16 +107,94 @@ endif() pdgsrfs_ABXglobal.c pdgsmv_AXglobal.c pdGetDiagU.c + pdgssvx3d.c ## 3D code + dnrformat_loc3d.c + pdgstrf3d.c + dtreeFactorization.c + dtreeFactorizationGPU.c + dgather.c + dscatter3d.c + pd3dcomm.c + dtrfAux.c + dcommunication_aux.c + dtrfCommWrapper.c + dsuperlu_blas.c ) +if (HAVE_CUDA) + list(APPEND sources dsuperlu_gpu.cu) +endif() + if (HAVE_COMBBLAS) list(APPEND sources d_c2cpp_GetHWPM.cpp dHWPM_CombBLAS.hpp) endif() -endif() ## enable double +endif() ########## enable double + +if(enable_single) + list(APPEND headers superlu_sdefs.h slustruct_gpu.h) + + list(APPEND sources + slangs_dist.c + sgsequ_dist.c + slaqgs_dist.c + sutil_dist.c + smemory_dist.c + smyblas2_dist.c + ssp_blas2_dist.c + ssp_blas3_dist.c + psgssvx.c + psgssvx_ABglobal.c + sreadhb.c + sreadrb.c + sreadtriple.c + sreadtriple_noheader.c + sbinary_io.c + sreadMM.c + psgsequ.c + pslaqgs.c + sldperm_dist.c + pslangs.c + psutil.c + pssymbfact_distdata.c + sdistribute.c + psdistribute.c + psgstrf.c + sstatic_schedule.c + psgstrf2.c + psgstrs.c + psgstrs1.c + psgstrs_lsum.c + psgstrs_Bglobal.c + psgsrfs.c + psgsmv.c + psgsrfs_ABXglobal.c + psgsmv_AXglobal.c + psGetDiagU.c + psgssvx3d.c ## 3D code + snrformat_loc3d.c + psgstrf3d.c + streeFactorization.c + streeFactorizationGPU.c + sgather.c + sscatter3d.c + ps3dcomm.c + strfAux.c + scommunication_aux.c + strfCommWrapper.c + ssuperlu_blas.c + ) +if (HAVE_CUDA) + list(APPEND sources ssuperlu_gpu.cu) +endif() +if (HAVE_COMBBLAS) + list(APPEND sources s_c2cpp_GetHWPM.cpp sHWPM_CombBLAS.hpp) +endif() + +endif() ########## enable single if(enable_complex16) - list(APPEND headers superlu_zdefs.h) + list(APPEND headers superlu_zdefs.h zlustruct_gpu.h) list(APPEND sources dcomplex_dist.c @@ -146,37 +234,64 @@ if(enable_complex16) pzgsrfs_ABXglobal.c pzgsmv_AXglobal.c pzGetDiagU.c + pzgssvx3d.c ## 3D code + znrformat_loc3d.c + pzgstrf3d.c + ztreeFactorization.c + ztreeFactorizationGPU.c + zscatter3d.c 
+ zgather.c + pz3dcomm.c ztrfAux.c + zcommunication_aux.c + ztrfCommWrapper.c + zsuperlu_blas.c ) -if (HAVE_COMBBLAS) - list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) +if (HAVE_CUDA) + list(APPEND sources zsuperlu_gpu.cu) endif() +if (HAVE_COMBBLAS) + list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) endif() +endif() ######### enable compex16 if (TPL_ENABLE_HIPLIB) + file(GLOB MyFiles *.hip.cpp) set_source_files_properties( - pdgstrs_lsum_cuda.hip.cpp + ${MyFiles} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_add_library(superlu_dist "pdgstrs_lsum_cuda.hip.cpp") + hip_add_library(superlu_dist "superlu_gpu_utils.hip.cpp") + hip_add_library(superlu_dist "dsuperlu_gpu.hip.cpp") + if(enable_single) + hip_add_library(superlu_dist "ssuperlu_gpu.hip.cpp") + endif() + if(enable_complex16) + hip_add_library(superlu_dist "zsuperlu_gpu.hip.cpp") + endif() else() add_library(superlu_dist "") endif() -# if (BUILD_SHARED_LIBS) -# set_property(TARGET superlu_dist PROPERTY POSITION_INDEPENDENT_CODE ON) -# endif() if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) if (TPL_ENABLE_HIPLIB) hip_add_library(superlu_dist-static STATIC "pdgstrs_lsum_cuda.hip.cpp") + hip_add_library(superlu_dist-static STATIC "dsuperlu_gpu.hip.cpp") + if(enable_single) + hip_add_library(superlu_dist-static STATIC "ssuperlu_gpu.hip.cpp") + endif() + if(enable_complex16) + hip_add_library(superlu_dist-static STATIC "zsuperlu_gpu.hip.cpp") + endif() else() add_library(superlu_dist-static STATIC "") endif() endif() -# if (BUILD_SHARED_LIBS) -# set_property(TARGET superlu_dist-static PROPERTY POSITION_INDEPENDENT_CODE ON) -# endif() + target_sources(superlu_dist PRIVATE ${sources} ${HEADERS}) set(targets superlu_dist) + if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) # build both shared and static libs target_sources(superlu_dist-static PRIVATE ${sources} ${HEADERS}) @@ -209,6 +324,17 @@ foreach(target ${targets}) endif() endforeach(target) +# Add CUDA runtime library and CUBLAS library 
+if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) +endif() + +# This is recommended by modern cmake: +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html +if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist OpenMP::OpenMP_C) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/Cnames.h b/SRC/Cnames.h deleted file mode 100644 index a4d1672d..00000000 --- a/SRC/Cnames.h +++ /dev/null @@ -1,378 +0,0 @@ -/*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ -/*! @file - * \brief Macro definitions - * - *
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- * 
- */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. 
- */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridmap f_superlu_gridmap_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_zCreate_CompRowLoc_Mat_dist f_zcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist 
f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_zSolveFinalize f_zsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_pzgssvx f_pzgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_zcreate_dist_matrix f_zcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ DASUM -#define idamax_ IDAMAX -#define dcopy_ DCOPY -#define dscal_ DSCAL -#define dger_ DGER -#define dnrm2_ DNRM2 -#define dsymv_ DSYMV -#define ddot_ DDOT -#define daxpy_ DAXPY -#define dsyr2_ DSYR2 -#define drot_ DROT -#define dgemv_ DGEMV -#define dtrsv_ DTRSV -#define dgemm_ DGEMM -#define dtrsm_ DTRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ DZASUM -#define izamax_ IZAMAX -#define zcopy_ ZCOPY -#define zscal_ ZSCAL -#define dznrm2_ DZNRM2 -#define zaxpy_ ZAXPY -#define zgemv_ ZGEMV -#define ztrsv_ ZTRSV -#define zgemm_ ZGEMM -#define ztrsm_ ZTRSM - -#define zgerc_ ZGERC -#define zhemv_ ZHEMV -#define zher2_ ZHER2 -#define zgeru_ ZGERU - -/* LAPACK */ -#define strtri_ STRTRI -#define dtrtri_ DTRTRI 
-#define ctrtri_ CTRTRI -#define ztrtri_ ZTRTRI - -/* -#define mc64id_dist MC64ID_DIST -#define mc64ad_dist MC64AD_DIST -*/ -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridmap F_SUPERLU_GRIDMAP -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define f_ScalePermstructFree 
F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_zCreate_CompRowLoc_Mat_dist F_ZCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_zSolveFinalize F_ZSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_pzgssvx F_PZGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_zcreate_dist_matrix F_ZCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) 
- */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define strtri_ strtri -#define dtrtri_ dtrtri -#define ctrtri_ ctrtri -#define ztrtri_ ztrtri - -/* -#define mc64id_dist mc64id_dist -#define mc64ad_dist mc64ad_dist -*/ - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle 
f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridmap f_superlu_gridmap -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_zSolveFinalize f_zsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_pzgssvx f_pzgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_zcreate_dist_matrix 
f_zcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#endif /* __SUPERLU_CNAMES */ diff --git a/SRC/Makefile b/SRC/Makefile index 18b21b01..19402bef 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -26,6 +26,12 @@ # ####################################################################### include ../make.inc + +# FACT3D = scatter.o +# pdgstrs_vecpar.o ancFactorization.o + +# pddrive_params.o + # # Precision independent routines # @@ -35,11 +41,9 @@ ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \ xerr_dist.o smach_dist.o dmach_dist.o \ superlu_dist_version.o TreeInterface.o - -ifeq ($(HAVE_CUDA),TRUE) -ALLAUX += gpublas_utils.o pdgstrs_lsum_cuda.o pdgstrs_lsum_cuda_dlink.o -endif - +# Following are from 3D code +ALLAUX += superlu_grid3d.o supernodal_etree.o supernodalForest.o \ + trfAux.o communication_aux.o treeFactorization.o sec_structs.o # # Routines literally taken from SuperLU, but renamed with suffix _dist # @@ -58,7 +62,12 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ pdgstrf.o dstatic_schedule.o pdgstrf2.o pdGetDiagU.o \ pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \ pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \ - dreadtriple_noheader.o + dreadtriple_noheader.o dsuperlu_blas.o +# from 3D code +DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \ + dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o \ + dnrformat_loc3d.o dtreeFactorizationGPU.o ##$(FACT3D) + # # Routines for double complex parallel SuperLU ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ @@ -68,7 +77,17 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ pzgstrf.o zstatic_schedule.o pzgstrf2.o pzGetDiagU.o \ pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \ pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ - zreadtriple_noheader.o + zreadtriple_noheader.o zsuperlu_blas.o +# from 3D code +ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o 
zscatter3d.o \ + zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \ + znrformat_loc3d.o ztreeFactorizationGPU.o ##$(FACT3D) + +ifeq ($(HAVE_CUDA),TRUE) +ALLAUX += cublas_utils.o superlu_gpu_utils.o pdgstrs_lsum_cuda.o +DPLUSRC += dsuperlu_gpu.o +ZPLUSRC += zsuperlu_gpu.o +endif ifeq ($(HAVE_COMBBLAS),TRUE) DPLUSRC += d_c2cpp_GetHWPM.o @@ -124,13 +143,9 @@ pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) -pdgstrs_lsum_cuda.o: pdgstrs_lsum_cuda.cu - $(NVCC) $(CUDACFLAGS) -dc $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c pdgstrs_lsum_cuda.cu $(VERBOSE) +.cu.o: + $(NVCC) $(NVCCFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) -pdgstrs_lsum_cuda_dlink.o: - $(NVCC) $(CUDACFLAGS) -dlink $(CUDALIBS) -o pdgstrs_lsum_cuda_dlink.o pdgstrs_lsum_cuda.o $(VERBOSE) - - .cpp.o: $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) diff --git a/SRC/TreeInterface.cpp b/SRC/TreeInterface.cpp new file mode 100644 index 00000000..60c957c5 --- /dev/null +++ b/SRC/TreeInterface.cpp @@ -0,0 +1,337 @@ +#include "TreeReduce_slu.hpp" +#include "dcomplex.h" + +namespace SuperLU_ASYNCOMM{ + + + +#ifdef __cplusplus + extern "C" { +#endif + + BcTree BcTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed, char precision){ + assert(msgSize>0); + if(precision=='d'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + } + + void BcTree_Destroy(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + 
if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + + } + + void BcTree_SetTag(BcTree Tree, Int tag, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + } + + + yes_no_t BcTree_IsRoot(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + } + + + void BcTree_forwardMessageSimple(BcTree Tree, void* localBuffer, Int msgSize, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((double*)localBuffer,msgSize); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((float*)localBuffer,msgSize); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); + } + } + + void BcTree_waitSendRequest(BcTree Tree, char precision) { + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + } + + void BcTree_allocateRequest(BcTree Tree, char precision){ + 
if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + } + + int BcTree_getDestCount(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + } + + int BcTree_GetMsgSize(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + } + + StdList StdList_Init(){ + std::list* lst = new std::list(); + return (StdList) lst; + } + void StdList_Pushback(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + list->push_back(dat); + } + + void StdList_Pushfront(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + list->push_front(dat); + } + + int_t StdList_Popfront(StdList lst){ + std::list* list = (std::list*) lst; + int_t dat = -1; + if((*list).begin()!=(*list).end()){ + dat = (*list).front(); + list->pop_front(); + } + return dat; + } + + yes_no_t StdList_Find(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + for (std::list::iterator itr = (*list).begin(); itr != (*list).end(); /*nothing*/){ + if(*itr==dat)return YES; + ++itr; + } + return NO; + } + + int_t StdList_Size(StdList lst){ + std::list* list = (std::list*) lst; + return list->size(); + } + + 
+ yes_no_t StdList_Empty(StdList lst){ + std::list* list = (std::list*) lst; + return (*list).begin()==(*list).end()?YES:NO; + } + + + RdTree RdTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed, char precision){ + assert(msgSize>0); + if(precision=='d'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + } + + void RdTree_Destroy(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + } + + + void RdTree_SetTag(RdTree Tree, Int tag, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + } + + int RdTree_GetDestCount(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + } + + int RdTree_GetMsgSize(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = 
(TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + } + + yes_no_t RdTree_IsRoot(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + } + + void RdTree_forwardMessageSimple(RdTree Tree, void* localBuffer, Int msgSize, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((double*)localBuffer,msgSize); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((float*)localBuffer,msgSize); + } + if(precision=='z'){TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); + } + } + + void RdTree_allocateRequest(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + } + + void RdTree_waitSendRequest(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + if(precision=='z'){ + 
TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + } + +#ifdef __cplusplus + } +#endif + +} //namespace SuperLU_ASYNCOMM + diff --git a/SRC/acc_aux.c b/SRC/acc_aux.c new file mode 100644 index 00000000..d62d0c4e --- /dev/null +++ b/SRC/acc_aux.c @@ -0,0 +1,665 @@ +#include "acc_aux.h" + +#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) + +// int +// get_thread_per_process () +// { +// char *ttemp; +// ttemp = getenv ("THREAD_PER_PROCESS"); + +// if (ttemp) +// return atoi (ttemp); +// else +// return 1; +// } + + +static inline double +load_imb (double *A, int nthreads) +{ + int i; + double _mx, _avg; + _mx = 0; + _avg = 0; + for (i = 0; i < nthreads; i++) + { + _mx = (((_mx) > (A[i])) ? (_mx) : (A[i])); + _avg += A[i]; + } + _avg = _avg / (double) nthreads; + return _mx - _avg; +} + + + +// int_t +// get_max_buffer_size () +// { +// char *ttemp; +// ttemp = getenv ("MAX_BUFFER_SIZE"); +// if (ttemp) +// return atoi (ttemp); +// else +// return 5000000; +// } + + +// #define ACC_ASYNC_COST 3.79e-3 + +#define MAX_DIM 12800 +#define MAX_IN_DIM 256 +#define LOG_2_MAX_IN_DIM 8 +#define LOG_2_MAX_DIM 7 + + +double get_acc_async_cost() +{ + char *ttemp; + ttemp = getenv ("ACC_ASYNC_COST"); + if (ttemp) + return (double) atof (ttemp); + else + return 4e-3; +} + +// #define CPU_BANDWIDTH 35.0 + +double cpu_bandwidth; +int communication_overlap; +double acc_async_cost; + + +int_t fixed_partition; +double frac; + +/* Sherry: these lookup tables are not needed on Titan, nor Summit */ +double CpuDgemmLookUp[8][8][9]; +double PhiDgemmLookUp[8][8][9]; +double PhiBWLookUp[8]; // no used? +double MicPciBandwidth[18]; // no used? 
+double MicScatterBW[24][24]; + +#ifdef OFFLOAD_PROFILE +double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING]; +double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; +#endif + +static inline unsigned int next_power_2(unsigned int v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + + +static inline unsigned int previous_power_2(unsigned int v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v / 2; +} + + +#include +// static inline uint32_t my_log2(const uint32_t x) { +// uint32_t y; +// asm ( "\tbsr %1, %0\n" +// : "=r"(y) +// : "r" (x) +// ); +// return y; +// } + +static inline uint32_t my_log2(const uint32_t x) +{ + return (uint32_t) log2((double) x); +} + +static inline unsigned int nearst_2_100(unsigned int v) +{ + v = (v + 99) / 100; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return SUPERLU_MIN(my_log2(v), LOG_2_MAX_DIM) ; +} + +static inline unsigned int nearst_k(unsigned int v) +{ + + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return SUPERLU_MIN(my_log2(v), LOG_2_MAX_IN_DIM) ; +} + + + +double estimate_acc_time(int m, int n , int k) +{ + double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + + double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))]; + + double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ; + // printf("gemm_time %.2e scatter_time %.2e, flop_rate %lf mop_rate %lf ",gemm_time, scatter_time, flop_rate,mop_rate); + if (gemm_time < 0) + { + /* code */ + printf(" m %d n %d k %d \n", m, n, k); + exit(0); + } + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if 
(communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (gemm_time + scatter_time) / eta; +} + + + +double estimate_acc_gemm_time(int m, int n , int k) +{ + double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if (communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (gemm_time) / eta; +} + + +double estimate_acc_scatter_time(int m, int n , int k) +{ + + double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))]; + + double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ; + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if (communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (scatter_time) / eta; +} + +double estimate_cpu_time(int m, int n , int k) +{ + if (m == 0 || n == 0 || k == 0) + { + return 0; + } + double flop_rate = CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + double scatter_time = 24e-9 * (double) m * (double)n / cpu_bandwidth ; + return gemm_time + scatter_time; +} + + +double acc_data_send_time(size_t sz) +{ + if (my_log2((sz + 999) / 1000) > 17 ) return 1e-9 * (double) sz / MicPciBandwidth[17]; + return 1e-9 * (double) sz / MicPciBandwidth[my_log2((sz + 999) / 1000)]; +} + + +void LookUpTableInit(int my_rank) +{ + char *ttemp; + char buffer[1024]; + char *line; + FILE *fp; + + ttemp = getenv("CPU_BANDWIDTH"); + if (ttemp) + { + 
cpu_bandwidth = atof(ttemp); +#ifdef GPU_DEBUG + if (!my_rank) printf("Bandwidth of CP is %lf \n", cpu_bandwidth ); +#endif + } + else + { + printf("Please set CPU_BANDWIDTH : bbye\n"); + exit(0); + + } + + // ttemp = getenv("SLU_ACC_COMM_OVERLAP"); + // if (ttemp) + // { + // communication_overlap = atoi(ttemp); + // if (!my_rank && communication_overlap ) printf("Using communication computation overlap version\n"); + // } + // else + // { + // printf("Please set SLU_ACC_COMM_OVERLAP : bbye\n"); + // exit(0); + // } + + + /*Reading CPU performance table */ + ttemp = getenv("CPU_DGEMM_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + double max_flop_rate = 0; + if (!fp) + { + if (!my_rank) printf("can not open %s: exiting \n", ttemp); + exit(0); + } + + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m, n, k; + double flop_rate; + sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate); + CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate; + max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate); + } + fclose(fp); + // printf("CPU: MAX FLOP Rate %lf GFLOP/Sec\n",max_flop_rate ); + } + else + { + printf("Assign CPU performance table \n"); + exit(0); + } + + ttemp = getenv("ACC_DGEMM_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + if (!fp) + { + printf("can not open %s: exiting \n", ttemp); + exit(0); + } + double max_flop_rate = 0; + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m, n, k; + double flop_rate; + sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate); + PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate; + max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate); + } + fclose(fp); +#ifdef GPU_DEBUG + if (!my_rank) printf("ACC: MAX FLOP Rate %lf GFLOP/Sec\n", max_flop_rate ); +#endif + } + else + { + printf("Assign ACC DGEMM performance table \n"); + exit(0); + } + + ttemp = getenv("ACC_SCATTER_PERF_TABLE"); + if (ttemp) + { + fp = 
fopen(ttemp, "r"); + double max_mop_rate = 0; + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m; + double mop_rate, sd; + sscanf(line, "%d, %lf, %lf", &m, &mop_rate, &sd); + PhiBWLookUp[nearst_2_100(m)] = mop_rate; + max_mop_rate = SUPERLU_MAX(mop_rate, max_mop_rate); + } + fclose(fp); +#ifdef GPU_DEBUG + if (!my_rank) printf("ACC: MAX MOP Rate %lf GFLOP/Sec\n", max_mop_rate ); +#endif + } + else + { + printf("Assign ACC DGEMM performance table \n"); + exit(0); + } + + + ttemp = getenv("ACC_PCI_BW_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m; + double bw; + sscanf(line, "%d,%lf", &m, &bw); + MicPciBandwidth[my_log2(m / 1000)] = bw; + + } + fclose(fp); + } + else + { + printf("Assign ACC_PCI_BW_TABLE \n"); + exit(0); + } + + ttemp = getenv("ACC_SCATTER_BW_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + + + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + fscanf(fp, "%lf", &MicScatterBW[i][j]); + // printf("%d %d %lf\n",i,j,MicScatterBW[i][j] ); + } + } + + + fclose(fp); + } + else + { + printf("Assign ACC_SCATTER_BW_TABLE \n"); + exit(0); + } + +#ifdef OFFLOAD_PROFILE + ttemp = getenv("ACC_SCATTER_TLI_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + double max_mop_rate = 0; + + for (int i = 0; i < MAX_BLOCK_SIZE / STEPPING; ++i) + { + for (int j = 0; j < MAX_BLOCK_SIZE / STEPPING; ++j) + { + + fscanf(fp, "%lf", &MicScatterTLI[i][j]); + if (MicScatterTLI[i][j] > 2) + { + MicScatterTLI[i][j] = 2; + } + // printf("%lf \n", MicScatterTLI[i][j]); + } + } + + + fclose(fp); + } + else + { + printf("ACC_SCATTER_TLI_TABLE \n"); + exit(0); + } + + ttemp = getenv("HOST_SCU_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + for (int_t k = 0; k < CBLOCK / CSTEPPING; ++k) + { + + for (int_t i = 0; i < CBLOCK / CSTEPPING; ++i) + { + for (int_t j = 0; j < CBLOCK / CSTEPPING; ++j) + { + fscanf(fp, "%lf", &host_scu_flop_rate[k][i][j]); + 
+ } + + } + } + fclose(fp); + } + else + { + printf("please assign HOST_SCU_PERF_TABLE \n"); + exit(0); + } + +#endif + + ttemp = getenv("FIXED_PARTITION"); + if (ttemp) + { + fixed_partition = atoi(ttemp); + if (fixed_partition) + { + printf("Using fixed workload partition \n"); + ttemp = getenv("CPU_ACC_WORK_PARTITION"); + if (ttemp) + { + frac = atof (ttemp); + } + else + { + frac = 1; + } + + } + + } + else + { + fixed_partition = 0; + } + +} /* end LookupTableInit */ + + +double l_count[24]; /*used for keeping entries*/ +double u_count[24]; /*for keeping u entries*/ + +double +estimate_acc_scatter_time_strat1(Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb ) +{ + for (int i = 0; i < 24; ++i) + { + l_count[i] = 0; + u_count[i] = 0; + } + + int_t cum_nrows = 0; + int_t cum_ncols = 0; + for (int i = 0; i < nub; ++i) + { + int_t ncols = Ublock_info[i].ncols; + int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0); + u_count[ind] += (double) ncols; + cum_ncols += ncols; + + } + + + for (int i = 0; i < nlb; ++i) + { + int_t nrows = Lblock_info[i].nrows; + int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0); + l_count[ind] += (double) nrows; + + cum_nrows += nrows; + + } + + double ttime = 0; + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + /* code */ + ttime += 8 * 3e-9 * l_count[i] * u_count[j] / MicScatterBW[i][j]; + } + } + + // ttime *= (MicScatterTLI[CLAMP( CEILING(cum_nrows, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )][CLAMP( CEILING(cum_ncols, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )]) ; + ttime *= SUPERLU_MIN(nub * nlb / 240 , 1); + return ttime; +} + +#ifdef OFFLOAD_PROFILE +/*following is a good strategy; gives good prediction but for some reason I do not get over all performance +improvement so I've ommited this thing out*/ +double +estimate_cpu_sc_time_strat1(int_t ldu, Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb ) +{ + int_t ind_k = SUPERLU_MAX(CEILING(ldu, 8) - 1, 0); + 
for (int i = 0; i < 24; ++i) + { + l_count[i] = 0; + u_count[i] = 0; + } + + int_t cum_nrows = 0; + int_t cum_ncols = 0; + for (int i = 0; i < nub; ++i) + { + int_t ncols = Ublock_info[i].ncols; + int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0); + u_count[ind] += (double) ncols; + cum_ncols += ncols; + + } + + + for (int i = 0; i < nlb; ++i) + { + int_t nrows = Lblock_info[i].nrows; + int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0); + l_count[ind] += (double) nrows; + cum_nrows += nrows; + } + + double ttime = 0; + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + /* flop rate is in gf/sec */ + ttime += 2e-9 * ldu * l_count[i] * u_count[j] / host_scu_flop_rate[ind_k][i][j]; + } + } + + return ttime; +} + +#endif + +/* Sherry: this routine is not called */ +int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi) +{ + int_t acc_cols, cpu_cols; + int_t total_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols; + if (frac == 0) + { + return num_u_blks_Phi; + } + else if (frac == 1) + { + return 0; + } + + for (int_t j = num_u_blks_Phi - 1; j > -1; --j) // ### + { + + acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ; + cpu_cols = total_cols - acc_cols; + + if (estimate_acc_time (Rnbrow, acc_cols, ldu_Phi) < acc_async_cost) + { + break; + } + if (cpu_cols > frac * total_cols ) + { + return j; + } + + } + + return 0; +} + + +/* Partition the "num_u_blks_Phi" portion into GPU and CPU part, + based on the estimated computational cost on CPU and GPU. + This was useful for the old Intel Phi architecture, but for the + new architecture, such as Titan and Summit, we can give everything + to GPU. 
+*/ +int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info, int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi ) +{ + double cpu_time, acc_time; + int_t acc_cols, cpu_cols; + + for (int_t j = num_u_blks_Phi - 1; j > -1; --j) // ### + { + + acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ; + cpu_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols - acc_cols; + acc_time = estimate_acc_scatter_time_strat1(&Ublock_info_Phi[0], j, + Remain_info, RemainBlk ) + estimate_acc_gemm_time(Rnbrow, acc_cols, ldu_Phi); + + cpu_time = estimate_cpu_time(Rnbrow, cpu_cols, ldu_Phi) + cpu_time_0; + + + // SCT.Predicted_host_sch_time[k0] = cpu_time_without_offload; + if (cpu_time > acc_time + acc_async_cost) + { + return j; + + } + } + + return 0; /*default value is zero */ +} + + diff --git a/SRC/acc_aux.h b/SRC/acc_aux.h new file mode 100644 index 00000000..b63ca119 --- /dev/null +++ b/SRC/acc_aux.h @@ -0,0 +1,48 @@ +#pragma once + +// #include "pdgstrf.h" + + +typedef struct mdwin_t +{ + double cpu_bandwidth; + int communication_overlap; + double acc_async_cost; + + + int_t fixed_partition; + double frac; + + double CpuDgemmLookUp[8][8][9]; + double PhiDgemmLookUp[8][8][9]; + double PhiBWLookUp[8]; + double MicPciBandwidth[18]; + double MicScatterBW[24][24]; + +#ifdef OFFLOAD_PROFILE + double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING]; + double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; +#endif +} mdwin_t; + +int_t +get_max_buffer_size (); + +double get_acc_async_cost(); + +double estimate_acc_time(int m, int n , int k); + +double estimate_acc_gemm_time(int m, int n , int k); + +double estimate_acc_scatter_time(int m, int n , int k); + +double estimate_cpu_time(int m, int n , int k); + +double acc_data_send_time(size_t sz); + +void LookUpTableInit(int my_rank); + + +int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t 
num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi); +int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info, + int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi ); \ No newline at end of file diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c new file mode 100644 index 00000000..ff0034fc --- /dev/null +++ b/SRC/communication_aux.c @@ -0,0 +1,236 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines to support communication in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include "superlu_defs.h" +#if 0 +#include "sec_structs.h" +#include "communication_aux.h" +#include "compiler.h" +#endif + + +int_t Wait_LSend +/*wait till broadcast of L finished*/ +(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *send_req, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t lk = LBj (k, grid); + int_t mycol = MYCOL (iam, grid); + MPI_Status status; + for (int_t pj = 0; pj < Pc; ++pj) + { + /* Wait for Isend to complete before using lsub/lusup. */ + if (ToSendR[lk][pj] != EMPTY && pj != mycol) + { + MPI_Wait (&send_req[pj], &status); + MPI_Wait (&send_req[pj + Pc], &status); + } + } + SCT->Wait_LSend_tl += ( SuperLU_timer_() - t1); + return 0; +} + + +int_t Wait_USend +/*wait till broadcast of U panels finished*/ +( MPI_Request *send_req, gridinfo_t *grid, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Status status; + for (int_t pi = 0; pi < Pr; ++pi) + { + if (pi != myrow) + { + MPI_Wait (&send_req[pi], &status); + MPI_Wait (&send_req[pi + Pr], &status); + } + } + SCT->Wait_USend_tl += (double) (SuperLU_timer_() - t1); + return 0; +} + + +int_t Check_LRecv +/*checks if diagnoal blocks have been received*/ + +( MPI_Request* recv_req, int* msgcnt ) +{ + int flag0, flag1; + MPI_Status status; + + flag0 = flag1 = 0; + if (recv_req[0] != MPI_REQUEST_NULL) + { + MPI_Test (&recv_req[0], &flag0, &status); + if (flag0) + { + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + } + else + flag0 = 1; + if (recv_req[1] != MPI_REQUEST_NULL) + { + MPI_Test (&recv_req[1], &flag1, &status); + if (flag1) + { + MPI_Get_count (&status, mpi_int_t, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + } + else + flag1 = 1; + + return flag1 && flag0; +} + + +int_t Wait_UDiagBlockSend(MPI_Request *U_diag_blk_send_req, + gridinfo_t * grid, SCT_t* SCT) +{ + + double t1 = 
SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Status status; + for (int_t pr = 0; pr < Pr; ++pr) + { + if (pr != myrow) + { + MPI_Wait (U_diag_blk_send_req + pr, &status); + } + } + SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + +int_t Wait_LDiagBlockSend(MPI_Request *L_diag_blk_send_req, + gridinfo_t * grid, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t mycol = MYCOL (iam, grid); + MPI_Status status; + for (int_t pc = 0; pc < Pc; ++pc) + { + if (pc != mycol) + { + MPI_Wait (L_diag_blk_send_req + pc, &status); + } + } + SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + + +int_t Wait_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + MPI_Wait(request, &status); + SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + +int_t Test_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + int flag; + MPI_Test(request,&flag, &status); + SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return flag; + +} + +int_t Wait_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + MPI_Wait(request, &status); + SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return 0; + +} + +int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + int flag; + MPI_Test(request, &flag, &status); + SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return flag; +} + +/* + * The following are from trfCommWrapper.c. 
+ */ +int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, + MPI_Request *L_diag_blk_send_req, + gridinfo_t *grid, SCT_t *SCT) +{ + // Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + // LocalLU_t *Llu = LUstruct->Llu; + // int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + if (iam == pkk) + { + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + Wait_LDiagBlockSend(L_diag_blk_send_req, grid, SCT); + } + + return 0; +} + + + +int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, + MPI_Request * L_diag_blk_recv_req, + gridinfo_t *grid) +{ + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && factored_U[k] == 0 && iam != pkk) + { + factored_U[k] = 1; + MPI_Status status; + MPI_Wait(L_diag_blk_recv_req, &status); + } + return 0; +} + diff --git a/SRC/dHWPM_CombBLAS.hpp b/SRC/dHWPM_CombBLAS.hpp index 53370d39..9d8b4bc1 100644 --- a/SRC/dHWPM_CombBLAS.hpp +++ b/SRC/dHWPM_CombBLAS.hpp @@ -73,7 +73,7 @@ dGetHWPM(SuperMatrix *A, gridinfo_t *grid, dScalePermstruct_t *ScalePermstruct) { printf("HWPM only supports square process grid. 
Retuning without a permutation.\n"); } - combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc; + combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc(grid->comm); std::vector< std::vector < std::tuple > > data(procs); /* ------------------------------------------------------------ diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c index bb842852..cdf0dc21 100644 --- a/SRC/dbinary_io.c +++ b/SRC/dbinary_io.c @@ -18,6 +18,7 @@ dread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -27,7 +28,7 @@ dwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("matrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); @@ -37,4 +38,5 @@ dwrite_binary(int_t n, int_t nnz, printf("dump binary file ... # of double fwrite: %d\n", nnz_written); assert(nnz_written==nnz); fclose(fp1); + return 0; } diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c new file mode 100644 index 00000000..ef9d6da6 --- /dev/null +++ b/SRC/dcommunication_aux.c @@ -0,0 +1,503 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Communication routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_ddefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t dIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t dBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t dIBcast_UPanel
+/*asynchronously broadcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_DOUBLE,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts U panel to my process row */
+int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     double* uval, gridinfo_t *grid,
+		    int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_DOUBLE, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dIrecv_LPanel
+/*it places Irecv calls for the L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, double* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, dLocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t dIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, double* Uval_buf, dLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column. */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+               SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ ,
+               scp->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
+               SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+               scp->comm, &recv_req_u[1]);
+
+    return 0;
+}
+
+int_t dWait_URecv
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dWait_LRecv
+/*waits till L blocks have been received*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size, /*number of elements to be broadcasted*/
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (ublk_ptr, size, MPI_DOUBLE, pr,
+                       SLU_MPI_TAG (4, k0) /* tag */ ,
+                       comm, U_diag_blk_send_req + pr);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                      int_t size, /*number of elements to be broadcasted*/
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Blocking receive of the U diagonal block of step k0 from process
+       row 'src' on the column communicator; the elapsed time is
+       accumulated into SCT->Recv_UDiagBlock_tl. */
+    double t_start = SuperLU_timer_();
+    MPI_Status recv_status;
+
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+    MPI_Recv (ublk_ptr, size, MPI_DOUBLE, src, SLU_MPI_TAG (4, k0),
+              (grid->cscp).comm, &recv_status);
+
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t_start;
+    return 0;
+}
+
+
+int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, dLocalLU_t *Llu)
+/*Copies src matrix into dest matrix*/
+{
+    /* Pack the nsupc x nsupc diagonal block of local L column k
+       (stored with leading dimension nsupr) into the dense buffer
+       Dest (leading dimension nsupc). */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    double *src = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    /* Leading dimension of the supernode's value storage; zero when
+       the local L column is empty. */
+    int_t nsupr = Llu->Lrowind_bc_ptr[lk] ? Llu->Lrowind_bc_ptr[lk][1] : 0;
+
+    /* Column-by-column copy of the leading nsupc rows. */
+    for (int_t col = 0; col < nsupc; ++col) {
+	memcpy( &Dest[col * nsupc], &src[col * nsupr], nsupc * sizeof(double) );
+    }
+    
+    return 0;
+}
+
+int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    /* Post a non-blocking send of the L diagonal block to every other
+       process column in this process row; one request per destination
+       column is recorded in L_diag_blk_send_req[]. */
+    MPI_Comm row_comm = (grid->rscp).comm;   /* row communicator */
+    int_t my_col = MYCOL (grid->iam, grid);
+    int_t num_cols = grid->npcol;
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < num_cols; ++dest)
+    {
+        if (dest == my_col) continue;        /* never send to self */
+        /* tag = ((k0<<2)+2) % tag_ub;        */
+        /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+        MPI_Isend (lblk_ptr, size, MPI_DOUBLE, dest,
+                   SLU_MPI_TAG (5, k0),
+                   row_comm, &L_diag_blk_send_req[dest]);
+    }
+
+    return 0;
+}
+
+
+int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Post a non-blocking receive for the U diagonal block of step k0
+     * from process row 'src' on the column communicator.  The request
+     * handle is returned via U_diag_blk_recv_req; elapsed time is
+     * accumulated into SCT->Recv_UDiagBlock_tl. */
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, MPI_DOUBLE, src,
+               		   SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    /* Bug fix: the original test (err == MPI_ERR_COUNT) only caught a
+     * single error class and silently ignored every other MPI failure;
+     * any code other than MPI_SUCCESS indicates an error. */
+    if (err != MPI_SUCCESS)
+    {
+        printf("Error in IRecv_UDiagBlock\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Post a non-blocking receive for the L diagonal block of step k0
+     * from process column 'src' on the row communicator.  The request
+     * handle is returned via L_diag_blk_recv_req. */
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, MPI_DOUBLE, src,
+                   SLU_MPI_TAG (5, k0),
+                   comm, L_diag_blk_recv_req);
+    /* Bug fix: the original test (err == MPI_ERR_COUNT) only caught a
+     * single error class; any code other than MPI_SUCCESS is an error.
+     * (Also fixed the "lDiagBlock" typo in the message.) */
+    if (err != MPI_SUCCESS)
+    {
+        printf("Error in IRecv_LDiagBlock\n");
+    }
+    /* NOTE(review): this accumulates into Recv_UDiagBlock_tl, not an
+     * L-specific counter — possibly a deliberate aggregate, possibly a
+     * copy-paste slip; confirm the SCT_t fields before changing. */
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    /* Non-blocking broadcast of the U diagonal block of supernode k
+       along the process column, rooted at the row that owns k. */
+    int_t root_row = PROW (k, grid);
+
+    MPI_Ibcast (ublk_ptr, size, MPI_DOUBLE, root_row,
+                (grid->cscp).comm, L_diag_blk_ibcast_req);
+
+    return 0;
+}
+
+int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    /* Non-blocking broadcast of the L diagonal block of supernode k
+       along the process row, rooted at the column that owns k. */
+    int_t root_col = PCOL (k, grid);
+
+    MPI_Ibcast (lblk_ptr, size, MPI_DOUBLE, root_col,
+                (grid->rscp).comm, U_diag_blk_ibcast_req);
+
+    return 0;
+}
+
+#endif 
+
+int_t dUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    /* Only act if the non-blocking receive of the diagonal block of
+       supernode k was actually posted. */
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /*factor the L panel*/
+        /* Only processes in k's column (excluding the diagonal owner
+           pkk) that have not yet factored this L panel need to wait. */
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            /* Leading dimension of the local L block; 0 when empty.
+               NOTE(review): nsupr is computed but not used below —
+               presumably kept for symmetry with the factoring path. */
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /*wait for communication to finish*/
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            /* Busy-poll until the receive completes. */
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/dgather.c b/SRC/dgather.c
new file mode 100644
index 00000000..0c618e42
--- /dev/null
+++ b/SRC/dgather.c
@@ -0,0 +1,398 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include <stdio.h>
+#include "superlu_ddefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t dprintMatrix(char*s, int n, int m, double* A, int LDA)
+{
+    /* Debug helper: print the n-by-m column-major matrix A
+       (leading dimension LDA), preceded by the label s. */
+    printf("%s\n", s );
+    for (int i = 0; i < n; i++)
+    {
+        for (int j = 0; j < m; j++)
+            printf("%g ", A[i + j * LDA]);
+        printf("\n");
+    }
+    return 0;
+}
+
+/* Gather the L panel of supernode k into the contiguous lookahead and
+ * remaining buffers held in HyP.  NOTE(review): this span of the patch
+ * was garbled by text extraction (the '<' comparisons and the
+ * dRgather_L signature were stripped); the signature and locals below
+ * are reconstructed — verify against upstream SuperLU_DIST dgather.c. */
+void dRgather_L( int_t k, int_t *lsub, double *lusup,
+                 gEtreeInfo_t* gEtreeInfo, Glu_persist_t *Glu_persist,
+                 gridinfo_t *grid, HyP_t *HyP,
+                 int_t *myIperm, int_t *iperm_c_supno )
+{
+    int_t temp_nbrow;   /* rows in the current row block */
+    int_t cum_nrow = 0; /* running row offset within the panel */
+    int_t ib;           /* block row index of L(ib,k) */
+    int_t *xsup = Glu_persist->xsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    dgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    dgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // dprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // dprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+/* Gather the U panel of supernode k into contiguous buffers, splitting
+ * the column blocks between the host list (Ublock_info) and the
+ * offload list (Ublock_info_Phi) according to elimination order,
+ * lookahead window, and etree parent relationships. */
+void dRgather_U( int_t k, int_t jj0, int_t *usub,	double *uval,
+                 double *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;     /* NOTE(review): full/full_Phi are computed but never read here */
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;
+    HyP->ldu = 0;
+
+    /* Classify every column block U(k,j), j >= jj0, as host or Phi and
+       record its metadata; also track the maximum segment size (ldu). */
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        /* Count the nonempty column segments of this block. */
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /*here goes the condition wether jb block exists on Phi or not*/
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first colum on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        /* Keep jb on the host when it is k's etree parent and k is its
+           last unfinished child. */
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            /* Host-side block list. */
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            /* Offload ("Phi") block list, sorted later by elim. order. */
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on ncols.
+       NOTE(review): element [0] is touched even when the corresponding
+       count is zero — harmless for the pointer math below, but confirm
+       the buffers are always allocated. */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    /* The Phi blocks are packed at the start of bigU; host blocks
+       follow them. */
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    dgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    dgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* dRgather_U */
diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h
new file mode 100644
index 00000000..d439bd2d
--- /dev/null
+++ b/SRC/dlustruct_gpu.h
@@ -0,0 +1,238 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_ddefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" + +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + double *bigV; + double *bigU; + double *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + double *Remain_L_buff; /* on GPU */ + double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} dSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + double *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + double *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + dSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + double *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* 
perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} dLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + dLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} dsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int dinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + dsluGPU_t *sluGPU, + gridinfo_t *grid, + dLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int dreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + dLUstruct_t *LUstruct); + +extern int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT); +extern int dsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, dsluGPU_t *sluGPU); +extern int dsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, double* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, dsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, + 
dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void dCopyLUToGPU3D (int* isNodeInMyGrid, dLocalLU_t *A_host, + dsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT); +extern int dfree_LUstruct_gpu (dLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(dsluGPU_t *sluGPU); + +extern void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda ); + +/*to print out various statistics*/ +void dprintGPUStats(dLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c index 9d047273..5b722ac9 100644 --- a/SRC/dmemory_dist.c +++ b/SRC/dmemory_dist.c @@ -170,8 +170,6 @@ double *doubleCalloc_dist(int_t n) return (buf); } -#if 0 //////////////// Sherry - /*************************************** * The following are from 3D code. ***************************************/ @@ -286,4 +284,3 @@ void d3D_printMemUse( trf3Dpartition_t* trf3Dpartition, dLUstruct_t *LUstruct, } } -#endif diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c new file mode 100644 index 00000000..97dac6a6 --- /dev/null +++ b/SRC/dnrformat_loc3d.c @@ -0,0 +1,575 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
+{
+    /* Copy an n-by-m column-major block from Src (leading dimension
+       ldsrc) into Dst (leading dimension lddst), one column at a time. */
+    for (int col = 0; col < m; ++col)
+    {
+        double *dst_col = Dst + lddst * col;
+        double *src_col = Src + ldsrc * col;
+        for (int row = 0; row < n; ++row)
+            dst_col[row] = src_col[row];
+    }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void dGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ double *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit  /* NOTE(review): shadows the function-scope declarations */
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	/* Gather per-process nnz and row counts along the Z dimension
+	   (zscp) onto layer 0. */
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	/* Build displacement (prefix-sum) arrays for the Gatherv calls. */
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	/* NOTE(review): on non-root ranks the A2d buffers are not
+	   allocated; MPI ignores recvbuf except at root, so this is the
+	   standard Gatherv idiom. */
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    /* Recompute fst_row as the prefix sum of m_loc over
+		       the ranks of the 2D layer-0 grid. */
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	/* Publish the gathered matrix and the count/displacement arrays
+	   in the returned A3d structure for reuse on later calls. */
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;  /* NOTE(review): shadows outer A2d */
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	/* Re-gather the (possibly modified) values and structure using
+	   the counts computed on the first call. */
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;; /* NOTE(review): stray second ';' — harmless empty statement */
+	
+	/* Btmp <- compact(B), compacting B */
+	double *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	/* B1/B2d are allocated (and significant) only on the root of
+	   zscp; on other ranks the Gatherv ignores its recvbuf. */
+	double *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+		    b_counts_int, b_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* dGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    double *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    double *Btmp; // on 3D grid
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_DOUBLE,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* dScatter_B3d */
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
index 53b02ab0..8be6ba3c 100644
--- a/SRC/dreadMM.c
+++ b/SRC/dreadMM.c
@@ -61,7 +61,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
@@ -108,7 +108,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 3/ Read n and nnz */
 #ifdef _LONGINT
-    sscanf(line, "%ld%ld%ld",m, n, nonz);
+    sscanf(line, "%lld%lld%lld", m, n, nonz);
 #else
     sscanf(line, "%d%d%d",m, n, nonz);
 #endif
diff --git a/SRC/dreadtriple.c b/SRC/dreadtriple.c
index 8053e69a..523e2596 100644
--- a/SRC/dreadtriple.c
+++ b/SRC/dreadtriple.c
@@ -47,7 +47,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
 #ifdef _LONGINT
-    fscanf(fp, "%ld%ld%ld", m, n, nonz);
+    fscanf(fp, "%lld%lld%lld", m, n, nonz);
 #else
     fscanf(fp, "%d%d%d", m, n, nonz);
 #endif
@@ -77,7 +77,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 
 #ifdef _LONGINT
-        fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+        fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
 #else // int 
         fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
 #endif
@@ -86,8 +86,9 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
-	    } else
+	    } else {
 		printf("triplet file: row/col indices are one-based.\n");
+     	    }
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c
index 09004af4..fce23583 100644
--- a/SRC/dreadtriple_noheader.c
+++ b/SRC/dreadtriple_noheader.c
@@ -49,9 +49,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     nz = *n = 0;
 
 #ifdef _LONGINT
-    ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali);
+    ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali);
 #else  // int
-    ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali);
+    ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
 #endif
 
     while (ret_val != EOF) {
@@ -62,9 +62,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	++nz;
 
 #ifdef _LONGINT
-        ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali);
+        ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali);
 #else  // int
-        ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali);
+        ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
 #endif
     }
     
@@ -105,7 +105,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* Read into the triplet array from a file */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+	fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
 #else // int32
 	fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
 #endif
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
index c167e19e..dbddd550 100644
--- a/SRC/dscatter.c
+++ b/SRC/dscatter.c
@@ -306,7 +306,7 @@ gemm_division_cpu_gpu(
 )
 {
     int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
@@ -433,7 +433,7 @@ gemm_division_new (int * num_streams_used,   /*number of streams that will be us
     )
 {
     int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
new file mode 100644
index 00000000..53af7944
--- /dev/null
+++ b/SRC/dscatter3d.c
@@ -0,0 +1,625 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, double *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+dblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int ldl,
+                    double *U_mat, int ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    double alpha = 1.0, beta = 0.0;
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        dscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple threads updating the same block*/
+void
+dblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;  double beta = 0.0;
+
+    /* calling DGEMM */
+    superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        dscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 dLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dgemm_scatterTopLeft */
+
+int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  dLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dblock_gemm_scatterTopRight */
+
+int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    dLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    //printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomLeft */
+
+/* Schur-complement update for one (L,U) block pair using the _Phi data
+ * (HyP->Ublock_info_Phi, HyP->bigU_Phi, HyP->ldu_Phi) -- presumably the
+ * U panel staged for the accelerator/offload path; confirm against the
+ * HyP setup code.  Otherwise identical in structure to
+ * dblock_gemm_scatterBottomLeft: a timing wrapper around
+ * dblock_gemm_scatter().  Returns 0 unconditionally.
+ */
+int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     dLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    /* Unpack the L/U index and value arrays from the local LU structure. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    /* Per-thread timer slots are spaced CACHE_LINE_SIZE apart, presumably
+       to avoid false sharing between threads -- TODO confirm. */
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+/* Dead code, disabled at compile time.  Per the note above, this is the
+ * same as dscatter_l in dscatter.c: it scatters the dense GEMM result
+ * 'tempv' (temp_nbrow x nsupc, leading dimension nbrow) into the
+ * destination L block L(ib, ljb), subtracting each nonzero column
+ * segment.  Kept here for reference only. */
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    /* Linear search of this column's block list for dest block ib. */
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    /* Map each destination row (relative to fnz) to its storage slot. */
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        /* Only columns with a nonzero segment carry data in tempv. */
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+/* Scatter the dense GEMM result 'tempv' (temp_nbrow x nsupc, leading
+ * dimension nbrow) into the destination U block U(ib, jb), subtracting
+ * each nonzero column segment.  Columns with klst - usub[iukp+jj] == 0
+ * carry no data in tempv and are skipped.  'indirect' caches the global
+ * row indices of the temp block so the inner loop avoids re-reading lsub. */
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);   /* first row of the next block */
+    int_t lib = LBi (ib, grid);        /* local block-row index of ib */
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    /* Cache the global row index of each row of the temp block. */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];   /* first nonzero row of column jj in the dest block */
+        /* Shift the base so ucol[] can be indexed by global row number:
+           the stored entries of this column are rows fnz..ilst-1. */
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        /* Net advance of ilst - fnz = number of stored entries in this
+           column (undoes the fnz shift above). */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
new file mode 100644
index 00000000..9b4cb79f
--- /dev/null
+++ b/SRC/dsuperlu_blas.c
@@ -0,0 +1,123 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_ddefs.h"
+
+#ifdef _CRAY
+/* Pre-built Fortran character descriptors for the common BLAS mode
+   arguments "N", "L", "N", "U", used by the Cray BLAS interface. */
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+/* GEMM wrapper: C := alpha*op(A)*op(B) + beta*C (double precision).
+ * Dispatches to Cray SGEMM (Fortran character descriptors), a vendor
+ * dgemm_ (which takes hidden character-length arguments -- the trailing
+ * 1, 1), or the reference dgemm_.  Scalars are passed by address to
+ * match the Fortran calling convention.  Returns 0 (or the callee's
+ * return value on the Cray/reference paths). */
+int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a,
+                  int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    dgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return dgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+/* TRSM wrapper: triangular solve with multiple right-hand sides,
+ * op(A)*X = alpha*B or X*op(A) = alpha*B, overwriting B with X.
+ * The vendor path passes hidden character-length arguments
+ * (trailing 1, 1, 1, 1).  Returns 0 (or dtrsm_'s return value on the
+ * reference path). */
+int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    dtrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return dtrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+/* GER wrapper: rank-1 update A := alpha*x*y**T + A.
+ * Uses Cray SGER when _CRAY is defined, otherwise dger_.  Returns 0. */
+int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    dger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+/* SCAL wrapper: x := alpha*x over n elements with stride incx. Returns 0. */
+int superlu_dscal(const int n, const double alpha, double *x, const int incx)
+{
+    dscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+/* AXPY wrapper: y := alpha*x + y over n elements with strides incx/incy.
+ * Returns 0. */
+int superlu_daxpy(const int n, const double alpha,
+    const double *x, const int incx, double *y, const int incy)
+{
+    daxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+/* GEMV wrapper: y := alpha*op(A)*x + beta*y.  The vendor path passes the
+ * hidden character-length argument for 'trans' (trailing 1).  Returns 0. */
+int superlu_dgemv(const char *trans, const int m,
+                  const int n, const double alpha, const double *a,
+                  const int lda, const double *x, const int incx,
+                  const double beta, double *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+/* TRSV wrapper: triangular solve op(A)*x = b, overwriting x (b on entry).
+ * Dispatches to Cray STRSV, vendor dtrsv_ (hidden character-length
+ * arguments: trailing 1, 1, 1), or the reference dtrsv_.  Returns 0. */
+int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                  int n, double *a, int lda, double *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
new file mode 100644
index 00000000..51a5b7b4
--- /dev/null
+++ b/SRC/dsuperlu_gpu.cu
@@ -0,0 +1,1780 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "superlu_defs.h" + +#undef Reduce + +//#include + +#include "dlustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + double *nzval, int_t ldv, + double *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void ddevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + double *nzval, int ldv, + const double *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + double * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + double * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + dLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + double *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + double *LnzvalVec = A_gpu->LnzvalVec; + double *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int THREAD_BLOCK_SIZE =ldt; + + int* 
pfxStorage = (int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + double *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + double *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if (thread_id < 
blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + double *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + ddevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + 
nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + double * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("dSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(double), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(double), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) + + bigu_send_size * sizeof(double) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + double alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = 
Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. 
*/ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + gpublasDgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end dSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(gpuDeviceProp devProp) +{ + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) +{ + /* Free the L data structure on GPU */ + checkGPU(gpuFree(A_gpu->LrowindVec)); + checkGPU(gpuFree(A_gpu->LrowindPtr)); + + checkGPU(gpuFree(A_gpu->LnzvalVec)); + checkGPU(gpuFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + + /*freeing the pinned memory*/ + int_t streamId = 0; + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkGPU(gpuFreeHost(A_gpu->acc_L_buff)); + checkGPU(gpuFreeHost(A_gpu->acc_U_buff)); + checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + 
checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE + SUPERLU_FREE(A_gpu->GemmStart); + SUPERLU_FREE(A_gpu->GemmEnd); + SUPERLU_FREE(A_gpu->ScatterEnd); + SUPERLU_FREE(A_gpu->ePCIeH2D); + SUPERLU_FREE(A_gpu->ePCIeD2H_Start); + SUPERLU_FREE(A_gpu->ePCIeD2H_End); + + /* Free the U data structure on GPU */ + checkGPU(gpuFree(A_gpu->UrowindVec)); + checkGPU(gpuFree(A_gpu->UrowindPtr)); + + //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated + + checkGPU(gpuFree(A_gpu->UnzvalVec)); + checkGPU(gpuFree(A_gpu->UnzvalPtr)); + + checkGPU(gpuFree(A_gpu->grid)); + + /* Free the Schur complement structure on GPU */ + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkGPU(gpuFree(A_gpu->indirect)); + // checkGPU(gpuFree(A_gpu->indirect2)); + checkGPU(gpuFree(A_gpu->xsup)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].usub)); + + checkGPU(gpuFree(A_gpu->local_l_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->jib_lookupVec)); + checkGPU(gpuFree(A_gpu->jib_lookupPtr)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->ijb_lookupVec)); + checkGPU(gpuFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) +{ + double *cPtr = (double *) malloc(sizeof(double) * lda * n); + checkGPU(gpuMemcpy( cPtr, dA, + lda * n * sizeof(double), gpuMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" 
); + } + free(cPtr); +} + +void dprintGPUStats(dLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + +/* Initialize the GPU side of the data structure. 
*/ +int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, // LU structures on GPU, see dlustruct_gpu.h + dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + (gpuDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nGPUStreams = getnGPUStreams(); + + int SCATTER_THREAD_BLOCK_SIZE = ldt; + if(getenv("SCATTER_THREAD_BLOCK_SIZE")) + { + int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE")); + if(stbs>=ldt) + { + SCATTER_THREAD_BLOCK_SIZE = stbs; + } + + } + + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams); + fflush(stdout); + printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE); + + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + gpuStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + gpuStreamCreate(&(sluGPU->funCallStreams[streamId])); + gpublasCreate(&(sluGPU->gpublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (dLUstruct_gpu_t *) malloc (sizeof(dLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. 
*/ + dCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end dinitSluGPU3D_t */ + + +int dinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + dsluGPU_t *sluGPU, gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && 
Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* dinitD2Hreduce */ + +int dreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + dLUstruct_t *LUstruct +) +{ + dLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + gpuStream_t CopyStream; + dLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = 
d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + double *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + double alpha = 1; + superlu_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + double *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + double alpha = 1; + superlu_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* dreduceGPUlu */ + + +int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int dsendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + gpuStream_t CopyStream = sluGPU->CopyStream;; + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(double), gpuMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, 
&A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(double), gpuMemcpyDeviceToHost, CopyStream ) ); + gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(double) + l_copy_len * sizeof(double); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int dsendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + double* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + dsluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] dsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in dsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(double)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(double)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(dsluGPU_t *sluGPU) +{ + return 0; +} +#endif + +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling dfree_LUsstruct_gpu(). 
*/ +void dCopyLUToGPU3D ( + int* isNodeInMyGrid, + dLocalLU_t *A_host, /* distributed LU structure on host */ + dsluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + 
A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(double))) ; + A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + 
+ /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + 
checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + 
+ checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(double) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + 
int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(double) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(double) * 1e-9, + u_val_len * sizeof(double) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(double))); + A_gpu->LnzvalVec = (double *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(double))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( 
&tmp_ptr, u_val_len * sizeof(double))); + A_gpu->UnzvalVec = (double *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(double))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(dLUstruct_gpu_t))) ; + *dA_gpu = (dLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(dLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end dCopyLUToGPU3D */ + + + +int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nGPUStreams; 
streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + dinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* dreduceAllAncestors3d_GPU */ + + +void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/dsuperlu_gpu.hip.cpp b/SRC/dsuperlu_gpu.hip.cpp new file mode 100644 index 00000000..b71efe5a --- /dev/null +++ b/SRC/dsuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "dsuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c new file mode 100644 index 00000000..b7fbdbff --- /dev/null +++ b/SRC/dtreeFactorization.c @@ -0,0 +1,763 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_ddefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t dLluBufInit(dLUValSubBuf_t* LUvsb, dLUstruct_t *LUstruct) /* allocate the four L/U panel exchange buffers, sized by Llu->bufmax[0..3] */
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]); L panel index buffer
+    LUvsb->Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]); L panel value buffer
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]); U panel index buffer
+    LUvsb->Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]); U panel value buffer
+    return 0; /* NOTE(review): the allocation results are not NULL-checked here */
+}
+
+ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) /* allocate one diagonal-factor buffer pair per leaf node; NULL if mxLeafNode == 0 */
+{
+    ddiagFactBufs_t** dFBufs = NULL; /* fix: was uninitialized and returned as-is when mxLeafNode == 0 */
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (ddiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(ddiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* allocate and initialize one ldt-by-ldt U/L diagonal-factor buffer pair */
+        dFBufs[i] = (ddiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(ddiagFactBufs_t));
+        assert(dFBufs[i]);
+        dinitDiagFactBufs(ldt, dFBufs[i]);
+
+    }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/
+
+    return dFBufs; /* NULL when mxLeafNode == 0, so callers never see an indeterminate pointer */
+}
+
+// sherry added
+int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs) /* release everything allocated by dinitDiagFactBufsArr() */
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+	SUPERLU_FREE(dFBufs[i]->BlockUFactor); /* allocated in dinitDiagFactBufs() */
+	SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+	SUPERLU_FREE(dFBufs[i]);
+    }
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs); /* the outer array exists only when mxLeafNode > 0 */
+
+    return 0;
+}
+
+dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct) /* one L/U exchange-buffer set per look-ahead level */
+{
+    dLUValSubBuf_t** LUvsbs = (dLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(dLUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* allocate the struct, then size its four buffers from Llu->bufmax[] */
+        LUvsbs[i] = (dLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
+        dLluBufInit(LUvsbs[i], LUstruct);
+    } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
+
+    return LUvsbs;
+}
+
+// sherry added
+int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs) /* release everything allocated by dLluBufInitArr() */
+{
+    for (int_t i = 0; i < numLA; ++i) {
+	SUPERLU_FREE(LUvsbs[i]->Lsub_buf); /* the four buffers allocated in dLluBufInit() */
+	SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+	SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+	SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, /* allocate Schur-complement-update work buffers; grid arg currently unused here */
+                  dscuBufs_t* scuBufs,
+                  dLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    scuBufs->bigV = dgetBigV(ldt, num_threads); /* scratch sized from ldt and num_threads (see dgetBigV) */
+    scuBufs->bigU = dgetBigU(nsupers, grid, LUstruct); /* packed-U buffer sized by dgetBigU */
+    return 0;
+}
+
+// sherry added
+int dfreeScuBufs(dscuBufs_t* scuBufs) /* release the buffers allocated by dinitScuBufs() */
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
+
+int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf) /* allocate the two ldt*ldt dense diagonal-block factor buffers */
+{
+    dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); U diagonal-block factor
+    dFBuf->BlockLFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); L diagonal-block factor
+    return 0;
+}
+
+int_t ddenseTreeFactor( /* sequential 2D subtree factorization: factor the nnodes supernodes in perm_c_supno order */
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    dLUValSubBuf_t* LUvsb,
+    ddiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization, then Ibcast of the diagonal block */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	dDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	dIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	dUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	dIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	dWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	dWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT); /* both panels of step k must have arrived before the Schur update */
+#endif
+        double tsch = SuperLU_timer_(); /* start timing the Schur-complement phase */
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= dSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            double* bigV = scuBufs->bigV;
+            double* bigU = scuBufs->bigU;
+
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij) /* flattened loop over all (U-block, L-block) pairs */
+            {
+                /* per-iteration aliases; ij decodes to the (ub, lb) block pair below */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                double** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                double *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                double *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                dblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT); /* drain the diagonal-block sends before reusing the buffers next step */
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* ddenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ *
+ * Factors the supernodes of one elimination subtree (sforest) on the 2D
+ * process grid using an asynchronous look-ahead pipeline:
+ *   1. the diagonal blocks of all first-level (leaf) nodes are factored
+ *      and Ibcast'ed up front;
+ *   2. each topological level then performs panel triangular solves,
+ *      posts non-blocking L/U panel broadcasts, and runs the
+ *      Schur-complement update, keeping up to numLA look-ahead
+ *      broadcasts in flight (communication buffers are cycled via
+ *      k0 % numLA).
+ *
+ * Returns 0 on success, 1 if the forest is empty (nnodes < 1).
+ * *info is cleared on entry; a numerical problem during diagonal
+ * factorization is presumably reported through *info by
+ * dDiagFactIBCast -- confirm against that routine.
+ */
+int_t dsparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    dscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter dsparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    /* Phase 0: factor and Ibcast the diagonal blocks of every node in the
+       first (leaf) topological level, before entering the pipeline below. */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    /* Pipelined traversal: one iteration per level of the elimination tree. */
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        /* Diagonal factorization of any node in this level not already
+           factored ahead of time (leaf phase or parent look-ahead below). */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        /* Panel triangular solves: compute L(:,k) and U(k,:) for each node
+           of this level once its diagonal block is available. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        /* Prime the pipeline: post the initial non-blocking L/U broadcasts
+           for the first numLA panels of this level.  NOTE(review): the
+           communication-buffer slot switches to k0 % numLA here (the earlier
+           loops use k0 - k_st), cycling the numLA look-ahead slots. */
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        /* Main pipeline loop: for each panel k of this level, wait for its
+           L/U broadcast, apply the Schur-complement update, then schedule
+           the next look-ahead broadcasts and cross-level work. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = dSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            double* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            /* Cross-level look-ahead: k was this parent's last unfactored
+               child, so its parent's diagonal block can be factored early. */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        /* NOTE(review): buffer slot is indexed relative to the
+                           start of the next level (k_end); this assumes
+                           k0_parent >= k_end -- verify for deep trees. */
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+            /* Remaining Schur-complement region (bottom-right blocks beyond
+               jj_cpu; jj_cpu is 0 here, so this covers all num_u_blks_Phi). */
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            /* Look-ahead panel solves for the NEXT topological level: if a
+               node's diagonal block has already arrived, do its triangular
+               solves now and broadcast the resulting panels early. */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    dLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    dUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit dsparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* dsparseTreeFactor_ASYNC */
diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c
new file mode 100644
index 00000000..e2b66c34
--- /dev/null
+++ b/SRC/dtreeFactorizationGPU.c
@@ -0,0 +1,758 @@
+
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "dlustruct_gpu.h" +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + 
int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + dsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + double *bigV = scuBufs->bigV; + double *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->RemainBlk * 
HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + dreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + dwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + dsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + dSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end dsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c new file mode 100644 index 00000000..30c98fc3 --- /dev/null +++ b/SRC/dtrfAux.c @@ -0,0 +1,757 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
+#include "superlu_ddefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure to assist HALO offload of Schur-complement.
+ *
+ * Arguments:
+ *   HyP - hybrid CPU/GPU scheduling structure; all of its buffers are
+ *         allocated here and the dirty-bit arrays are reset
+ *   Llu - local LU data; Llu->bufmax[1] bounds the L-panel value buffers
+ *   mcb - number of column blocks; sizes Ublock_info/Ublock_info_Phi
+ *         and Lblock_dirty_bit
+ *   mrb - number of row blocks; sizes lookAhead_info/Remain_info
+ *         and Ublock_dirty_bit
+ */
+void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+#if 0 /* 64-byte-aligned allocation variant, currently disabled */
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    /* -1 marks a block as "clean": not yet touched by an offloaded update. */
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    /* -1: no Schur-complement update has been offloaded yet.
+       (The original code assigned this twice; the redundant first
+       assignment before the allocations has been removed.) */
+    HyP->last_offload = -1;
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nGPUStreams = 0;
+} /* dInit_HyP */
+
+/* Initialize the 3D LU structure from a list of supernodal forests.
+ *
+ * Translates the forest interface (one sForest_t per node of the
+ * elimination-tree hierarchy) into the flat node-count / node-list
+ * arrays expected by dinit3DLUstruct, delegates to it, and then
+ * releases the temporary arrays.
+ */
+void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, dLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    /* A complete binary tree with maxLvl levels has 2^maxLvl - 1 forests. */
+    int_t numForests = (1 << maxLvl) - 1;
+    int_t* gNodeCount = INT_T_ALLOC (numForests);
+    int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*));
+
+    /* Flatten each (possibly absent) forest into count/list form. */
+    for (int i = 0; i < numForests; ++i)
+    {
+        if (sForests[i] != NULL)
+        {
+            gNodeCount[i] = sForests[i]->nNodes;
+            gNodeLists[i] = sForests[i]->nodeList;
+        }
+        else
+        {
+            gNodeCount[i] = 0;
+            gNodeLists[i] = NULL;
+        }
+    }
+
+    /* Hand off to the original (non-forest) initialization routine. */
+    dinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     gNodeCount, gNodeLists, LUstruct, grid3d);
+
+    /* Only the temporary arrays are freed; the node lists themselves
+       are owned by the sForests and must stay alive. */
+    SUPERLU_FREE(gNodeCount);  // sherry added
+    SUPERLU_FREE(gNodeLists);
+}
+
+/* Set up the Schur-complement update for panel k (CPU path).
+ *
+ * Locates the k-th L panel (locally owned when mycol == kcol, otherwise
+ * the received Lsub_buf/Lval_buf) and the k-th U panel (locally owned
+ * when myrow == krow, otherwise Usub_buf/Uval_buf), reorders the U
+ * blocks, gathers U into the dense buffer bigU, and fills the per-block
+ * metadata arrays Ublock_info and Remain_info consumed by the GEMM phase.
+ *
+ * Returns 0 when msgcnt indicates L(:,k) or U(k,:) is empty (nothing to
+ * update), nonzero otherwise.
+ */
+int_t dSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    double *bigU,
+    int_t* Lsub_buf,
+    double *Lval_buf,
+    int_t* Usub_buf,
+    double *Uval_buf,
+    gridinfo_t *grid,
+    dLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    double* uval;
+    int_t* lsub;
+    double* lusup;
+
+    /* Select the L panel: local storage if this process column owns
+       column k, otherwise the communication buffers. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    /* Select the U panel analogously. */
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+        /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval
+           stay uninitialized; presumably msgcnt is then 0 so the early
+           return below fires before they are read — TODO confirm. */
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            /* Without ISORT, perm_u stores (key, index) pairs for qsort. */
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        /* Scan the (reordered) U blocks: count nonzero columns per block
+           and track the tallest segment (ldu = leading dimension for the
+           gathered dense U). */
+        int_t ldu   = 0;
+        int_t full  = 1;   /* NOTE(review): computed but never read here */
+        int_t num_u_blks = 0;
+
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;   /* shadows the outer iukp on purpose */
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            /* Count columns with a nonzero segment (fstnz < klst). */
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        /* Prefix sums: full_u_cols = cumulative column count, StCol =
+           starting column of each block in the gathered buffer. */
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        /* Pack the sparse U panel into the dense ldu x full_u_cols bigU. */
+        dgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        /* Record every off-diagonal row block of L(:,k) in Remain_info. */
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        /* NOTE(review): lptr/luptr are reset but not used again here. */
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* dSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+int_t dSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    dscuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, dLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    double * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    double* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    double* uval;
+    int_t* lsub;
+    double* lusup;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    if (mycol == kcol)
+    {
+        /* L(:,k) resides locally in this process column; point into block-column storage */
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /*now each process does the Schur-complement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        dRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            dRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* dSchurComplementSetupGPU */
+
+
+double* dgetBigV(int_t ldt, int_t num_threads)
+{
+    double *bigV;
+    if (!(bigV = doubleMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for dgemm buffV");
+    return bigV;
+}
+
+double* dgetBigU(int_t nsupers, gridinfo_t *grid, dLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction of within A row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /*Buffer size is the max over the look-ahead window*/
+
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    double* bigU = doubleMalloc_dist(bigu_size);
+
+    return bigU;
+} /* dgetBigU */
+
+
+trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      dLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter dinitTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    dinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
+    dLluBufInit(LUvsb, LUstruct);
+
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* code */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* code */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* dinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit dDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* code */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /*initialize perm_u*/
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
new file mode 100644
index 00000000..c7b51a54
--- /dev/null
+++ b/SRC/dtrfCommWrapper.c
@@ -0,0 +1,548 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+int_t dDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     double *BlockUFactor,
+                     double *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     dLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    //xsup for supersize
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        dIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        dIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /*factorize the diagonal block and broadcast it to the
+     process row and process column*/
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Dgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is overwritten here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        dPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /*Isend U blocks to the process row*/
+        int_t nsupc = SuperSize(k);
+        dISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        dISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
+		      double* BlockUFactor,
+		      gridinfo_t *grid,
+		      dLUstruct_t *LUstruct)
+{
+    double alpha = 1.0;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /*factor the L panel*/
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        double *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        double* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        double *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        int_t l = nsupr - nsupc;
+
+        double* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* dLPanelTrSolve */
+
+int_t dLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    double* BlockUFactor,
+                    gridinfo_t *grid,
+                    dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+
+    dUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+}  /* dLPanelUpdate */
+
+#define BL  32
+
+int_t dUPanelTrSolve( int_t k,  
+                     double* BlockLFactor,
+                     double* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     dLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /*factor the U panel*/
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        double *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        double *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                double *tempv = bigV +  thread_id * ldt * ldt;
+                dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /*factor the U panel*/
+    if (iam == pkk)
+    {
+        /* code */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        double *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            double *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            double *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    double *tempv = bigV +  thread_id * ldt * ldt;
+                    dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* dUPanelTrSolve */
+
+int_t dUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    double* BlockLFactor,
+                    double* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    dLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+
+    dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
+    return 0;
+}
+
+int_t dIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    double* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    dLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* code */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        double* lusup = Lnzval_bc_ptr[lk];
+
+        dIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(double));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*place Irecv*/
+            dIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, double* Uval_buf,
+    			     gridinfo_t *grid, dLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /*send U panel to myrow*/
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        double* uval = Unzval_br_ptr[lk];
+        dIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* code */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(double));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            dIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t dWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /*wait for pending sends of the L panel to complete*/
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*force wait for I recv to complete*/
+            dWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t dWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            dWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index 1879594e..0fd6d9a2 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -14,10 +14,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include 
@@ -392,6 +392,7 @@ void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, double c)
 
     return;
 }
+/**** end utilities added for SUNDIALS ****/
 
 /*! \brief Allocate storage in ScalePermstruct */
 void dScalePermstructInit(const int_t m, const int_t n,
@@ -420,9 +421,65 @@ void dScalePermstructFree(dScalePermstruct_t *ScalePermstruct)
         SUPERLU_FREE(ScalePermstruct->R);
         SUPERLU_FREE(ScalePermstruct->C);
         break;
+      default: break;
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int dDeAllocGlu_3d(dLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int dDeAllocLlu_3d(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* dDeAllocLlu_3d */
+
 
 /**** Other utilities ****/
 void
@@ -431,8 +488,8 @@ dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx)
     int  i, j;
     for (j = 0; j < nrhs; ++j)
 	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/
-	    else x[i + j*ldx] = 1.0;
+	    if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n;
+	    else x[i + j*ldx] = 1.0 - (double)(i+1.)/n;
 	}
 }
 
@@ -559,7 +616,7 @@ void dPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 
 /*! \brief Sets all entries of matrix L to zero.
  */
-void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
+void dZeroLblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct)
 {
     double zero = 0.0;
     register int extra, gb, j, lb, nsupc, nsupr, ncb;
@@ -589,7 +646,7 @@ void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
             }
 	}
     }
-} /* dZeroLblocks */
+} /* end dZeroLblocks */
 
 
 /*! \brief Dump the factored matrix L using matlab triple-let format
@@ -598,8 +655,8 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, dLocalLU_t *Llu)
 {
     register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-	int_t nnzL, n,nmax;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
     int_t *xsup = Glu_persist->xsup;
     int_t *index;
     double *nzval;
@@ -652,7 +709,7 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		}
 
 	if(grid->iam==0){
-		fprintf(fp, "%d %d %d\n", n,n,nnzL);
+		fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
 	}
 
      ncb = nsupers / grid->npcol;
@@ -1185,7 +1242,37 @@ void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
 	}
     }
-} /* DPRINTUBLOCKS */
+} /* end dPrintUblocks */
+
+/*! \brief Sets all entries of matrix U to zero.
+ */
+void dZeroUblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct)
+{
+    double zero = 0.0;
+    register int i, extra, lb, len, nrb;
+    register int myrow, r;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    double *nzval;
+    int nsupers = Glu_persist->supno[n-1] + 1;
+
+    nrb = nsupers / grid->nprow;
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb;
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    len = index[1];  // number of entries in nzval[];
+	    for (i = 0; i < len; ++i) {
+	        nzval[i] = zero;
+	    }
+	}
+    }
+} /* end dZeroUblocks */
 
 int
 dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm,
diff --git a/SRC/get_perm_c_parmetis.c b/SRC/get_perm_c_parmetis.c
index 6381f4e5..b08c35ab 100644
--- a/SRC/get_perm_c_parmetis.c
+++ b/SRC/get_perm_c_parmetis.c
@@ -122,7 +122,7 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   int_t m, n, bnz, i, j;
   int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
   int_t *b_rowptr, *b_colind;
-  int_t *dist_order;
+  int_t *dist_order = NULL;
   int  *recvcnts, *displs;
   /* first row index on each processor when the matrix is distributed
      on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
@@ -260,7 +260,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   if (bnz) SUPERLU_FREE (b_colind);
   SUPERLU_FREE (b_rowptr);
 
-#if 0  
+#if 0  /* The following code is not needed anymore, because parmetis
+	  now supports 64bit indexing */
   if ( iam < noDomains) {
     SUPERLU_FREE (options);
   }
@@ -283,7 +284,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   dist_order = dist_order_int;
 #endif
 
-#endif
+#endif  /* not needed any more */
+
   
   /* Allgatherv dist_order to get perm_c */
   if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
diff --git a/SRC/gpu_wrapper.h b/SRC/gpu_wrapper.h
index bb644864..65499945 100644
--- a/SRC/gpu_wrapper.h
+++ b/SRC/gpu_wrapper.h
@@ -38,6 +38,15 @@
 #define gpuStreamCreateWithFlags cudaStreamCreateWithFlags
 #define gpuStreamDefault cudaStreamDefault
 #define gpublasStatus_t cublasStatus_t
+#define gpuEventCreate cudaEventCreate
+#define gpuEventRecord cudaEventRecord
+#define gpuMemGetInfo cudaMemGetInfo
+#define gpuOccupancyMaxPotentialBlockSize cudaOccupancyMaxPotentialBlockSize
+#define gpuEventElapsedTime cudaEventElapsedTime
+#define gpuDeviceReset cudaDeviceReset
+#define gpuMallocHost cudaMallocHost
+#define gpuEvent_t cudaEvent_t
+#define gpuMemset cudaMemset
 #define  GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS 
 #define  GPUBLAS_STATUS_NOT_INITIALIZED CUBLAS_STATUS_NOT_INITIALIZED 
 #define  GPUBLAS_STATUS_ALLOC_FAILED CUBLAS_STATUS_ALLOC_FAILED 
@@ -46,6 +55,8 @@
 #define  GPUBLAS_STATUS_MAPPING_ERROR CUBLAS_STATUS_MAPPING_ERROR 
 #define  GPUBLAS_STATUS_EXECUTION_FAILED CUBLAS_STATUS_EXECUTION_FAILED 
 #define  GPUBLAS_STATUS_INTERNAL_ERROR CUBLAS_STATUS_INTERNAL_ERROR 
+#define  GPUBLAS_STATUS_LICENSE_ERROR CUBLAS_STATUS_LICENSE_ERROR 
+#define  GPUBLAS_STATUS_NOT_SUPPORTED CUBLAS_STATUS_NOT_SUPPORTED 
 #define  gpublasCreate cublasCreate 
 #define  gpublasDestroy cublasDestroy
 #define  gpublasHandle_t cublasHandle_t
@@ -105,6 +116,15 @@
 #define gpuStreamCreateWithFlags hipStreamCreateWithFlags
 #define gpuStreamDefault hipStreamDefault
 #define gpublasStatus_t hipblasStatus_t
+#define gpuEventCreate hipEventCreate
+#define gpuEventRecord hipEventRecord
+#define gpuMemGetInfo hipMemGetInfo
+#define gpuOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define gpuEventElapsedTime hipEventElapsedTime
+#define gpuDeviceReset hipDeviceReset
+#define gpuMallocHost hipMallocHost
+#define gpuEvent_t hipEvent_t
+#define gpuMemset hipMemset
 #define  GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS 
 #define  GPUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED 
 #define  GPUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED 
@@ -113,6 +133,8 @@
 #define  GPUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR 
 #define  GPUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED 
 #define  GPUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR 
+#define  GPUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_LICENSE_ERROR 
+#define  GPUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED 
 #define  gpublasCreate hipblasCreate 
 #define  gpublasDestroy hipblasDestroy
 #define  gpublasHandle_t hipblasHandle_t
diff --git a/SRC/gpublas_utils.c b/SRC/gpublas_utils.c
index ad0c1534..dff19cf9 100644
--- a/SRC/gpublas_utils.c
+++ b/SRC/gpublas_utils.c
@@ -72,11 +72,14 @@ const char* gpublasGetErrorString(gpublasStatus_t status)
         case GPUBLAS_STATUS_MAPPING_ERROR: return "GPUBLAS_STATUS_MAPPING_ERROR";
         case GPUBLAS_STATUS_EXECUTION_FAILED: return "GPUBLAS_STATUS_EXECUTION_FAILED"; 
         case GPUBLAS_STATUS_INTERNAL_ERROR: return "GPUBLAS_STATUS_INTERNAL_ERROR"; 
+        case GPUBLAS_STATUS_LICENSE_ERROR: return "GPUBLAS_STATUS_LICENSE_ERROR"; 
+        case GPUBLAS_STATUS_NOT_SUPPORTED: return "GPUBLAS_STATUS_NOT_SUPPORTED"; 
     }
     return "unknown error";
 }
 
-inline
+/*error reporting functions */
+//inline
 gpuError_t checkGPU(gpuError_t result)
 {
 #if defined(DEBUG) || defined(_DEBUG)
diff --git a/SRC/gpublas_utils.h b/SRC/gpublas_utils.h
index 895bdc6d..72cab46c 100644
--- a/SRC/gpublas_utils.h
+++ b/SRC/gpublas_utils.h
@@ -22,6 +22,7 @@ at the top-level directory.
 #ifdef GPU_ACC
 
 #include "gpu_wrapper.h"
+typedef struct LUstruct_gpu_  LUstruct_gpu;  // Sherry - not in this distribution
 
 extern void DisplayHeader();
 extern const char* gpublasGetErrorString(gpublasStatus_t status);
diff --git a/SRC/memory.c b/SRC/memory.c
index 8a253e9a..bee42f8f 100644
--- a/SRC/memory.c
+++ b/SRC/memory.c
@@ -66,8 +66,8 @@ void *superlu_malloc_dist(size_t size)
     int iam;
 
     MPI_Comm_rank(MPI_COMM_WORLD, &iam);
-    if ( size <= 0 ) {
-	printf("(%d) superlu_malloc size %lld\n", iam, size);
+    if ( size < 0 ) {
+	printf("(%d) superlu_malloc size %lu\n", iam, size);
 	ABORT("superlu_malloc: nonpositive size");
     }
 // #ifdef GPU_ACC    
@@ -77,7 +77,7 @@ void *superlu_malloc_dist(size_t size)
 // #endif
 	
     if ( !buf ) {
-	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lld\n",
+	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lu\n",
 	       iam, superlu_malloc_total*1e-6, size);
 	ABORT("superlu_malloc: out of memory");
     }
@@ -143,7 +143,7 @@ void superlu_free_dist(void *addr) { gpuError_t error = gpuFree(addr);}
 #else 
 
 #if  0 
-// #if (__STDC_VERSION__ >= 201112L)    // this is very slow on tulip
+// #if (__STDC_VERSION__ >= 201112L)   // cannot compile on Summit, also this is very slow on tulip
 
 void * superlu_malloc_dist(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);}
 void   superlu_free_dist(void * ptr)    {free(ptr);}
@@ -426,8 +426,7 @@ int_t symbfact_SubXpand
     
 #if ( DEBUGlevel>=1 )
     printf("symbfact_SubXpand(): jcol " IFMT ", next " IFMT ", maxlen " IFMT
-	   ", MemType " IFMT "\n",
-	   jcol, next, *maxlen, mem_type);
+	   ", MemType %d\n", jcol, next, *maxlen, mem_type);
 #endif    
 
     new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
new file mode 100644
index 00000000..ee57c7e7
--- /dev/null
+++ b/SRC/pd3dcomm.c
@@ -0,0 +1,876 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines for the 3D algorithm.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_ddefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(double), MPI_INFO_NULL, &(a)))
+
+int_t dAllocLlu(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+    
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t   **Lrowind_bc_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
+    double  **Lnzval_bc_ptr =
+	(double **) SUPERLU_MALLOC(sizeof(double*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (i = 0; i < nbc ; ++i)
+	{
+	    /* code */
+	    Lrowind_bc_ptr[i] = NULL;
+	    Lnzval_bc_ptr[i] = NULL;
+	}
+    
+    int_t   **Ufstnz_br_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
+    double  **Unzval_br_ptr =
+	(double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */
+    
+    for (i = 0; i < nbr ; ++i)
+	{
+	    /* code */
+	    Ufstnz_br_ptr[i] = NULL;
+	    Unzval_br_ptr[i] = NULL;
+	}
+
+   // Sherry: use int type
+                  /* Recv from no one (0), left (1), and up (2).*/
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+                  /* Whether need to send down block row. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
+                  /* List of processes to send right block col. */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    //ToSendR[i] = INT_T_ALLOC(Pc);
+	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+	}
+    
+    /*now setup the pointers*/
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
+    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
+    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
+    Llu->Unzval_br_ptr = Unzval_br_ptr ;
+    Llu->ToRecv = ToRecv ;
+    Llu->ToSendD = ToSendD ;
+    Llu->ToSendR = ToSendR ;
+    
+    return 0;
+} /* dAllocLlu */
+
+int_t dmpiMallocLUStruct(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    double * uval = Unzval_br_ptr[lb];
+	    double * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(double));
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    double *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(double));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+int_t dzSendLPanel(int_t k, int_t receiver,
+                   dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int_t len   = lsub[1];       /* LDA of the nzval[] */
+		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
+		    
+		    MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += len2 * sizeof(double);
+		}
+	}
+    return 0;
+}
+
+
+int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
+                    double* Lval_buf,
+                    dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int len   = lsub[1];       /* LDA of the nzval[] */
+		    int len2  = SuperSize(k) * len; /* size of nzval of L panels */
+		    
+		    MPI_Status status;
+		    MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_dscal(len2, alpha, lnzval, 1);
+		    superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+		}
+	}
+
+    return 0;
+}
+
+int_t dzSendUPanel(int_t k, int_t receiver,
+                    dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    double* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    
+		    /* code */
+		    MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += lenv * sizeof(double);
+		}
+	}
+	
+    return 0;
+}
+
+
+int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
+                    double* Uval_buf, dLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    double* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    MPI_Status status;
+		    MPI_Recv(Uval_buf , lenv, MPI_DOUBLE, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_dscal(lenv, alpha, unzval, 1);
+		    superlu_daxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+		}
+	}
+    return 0;
+}
+
+
+int_t dp3dScatter(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	dAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	dAllocLlu(nsupers, LUstruct, grid3d);
+    
+    dLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    dscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    dscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now sending tosendR etc */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    //    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+	mpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* dp3dScatter */
+
+
+int_t dscatter3dUPanels(int_t nsupers,
+		       dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	double * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];
+		    lens = usub[2];
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, MPI_DOUBLE, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end dscatter3dUPanels */
+
+
+int_t dscatter3dLPanels(int_t nsupers,
+                       dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		int_t nrbl, len, len1, len2;
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = doubleCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, MPI_DOUBLE, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* dscatter3dLPanels */
+
+int_t dcollect3dLpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		    
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];       /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
+			    
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+    {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	double * uval = Unzval_br_ptr[lb];
+	    
+	if (usub)
+	{
+	    /* code */
+	    int lenv = usub[1];
+	    if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm);
+		}
+		    
+	    if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status);
+		}
+	}
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+    dcollect3dLpanels(layer, nsupers,  LUstruct, grid3d);
+    dcollect3dUpanels(layer,  nsupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+int_t dzeroSetLU(int_t nnodes, int_t* nodeList, dLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t iam = grid->iam;
+    
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t jb = nodeList[node];
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub;
+		    double* lnzval;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub != NULL)
+			{
+			    int_t len   = lsub[1];       /* LDA of the nzval[] */
+			    int_t len2  = SuperSize(jb) * len;	/*size of nzval of L panel */
+			    memset( lnzval, 0, len2 * sizeof(double) );
+			}
+		}
+	}
+
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t ib = nodeList[node];
+	    int_t pr = PROW( ib, grid );
+	    if (myrow == pr)
+		{
+		    int_t lib = LBi( ib, grid ); /* Local block number */
+		    int_t  *usub;
+		    double* unzval;
+		    usub = Ufstnz_br_ptr[lib];
+		    unzval = Unzval_br_ptr[lib];
+		    
+		    if (usub != NULL)
+			{
+			    int lenv = usub[1];
+			    memset( unzval, 0, lenv * sizeof(double) );
+			}
+		}
+	}
+    
+    return 0;
+}
+
+
+int_t dreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        double* Lval_buf, double* Uval_buf,
+                        dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 1.0, beta = 1.0;	
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    
+	    if (myGrid == sender)
+		{
+		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		}
+	    else {
+	        dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                LUstruct, grid3d, SCT);
+		dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                LUstruct,  grid3d, SCT);
+	    }
+	    
+	}
+    return 0;
+    
+}
+
+
+int_t dgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        dLUValSubBuf_t* LUvsb,
+                        dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 0.0, beta = 1.0;	
+    double * Lval_buf  = LUvsb->Lval_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    if (myGrid == sender)
+		{
+		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    
+		}
+	    else
+		{
+		    dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                     LUstruct, grid3d, SCT);
+		    dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                     LUstruct, grid3d, SCT);
+		}
+	}
+    return 0;
+    
+}
+
+
+/*
+ * dinit3DLUstruct: zero out the L and U blocks of every elimination tree
+ * this process participates in only passively.
+ *
+ * maxLvl is the number of levels in the binary tree of z-processes
+ * (log2(Np) + 1).  For each level whose myZeroTrIdxs[lvl] flag is set,
+ * the supernodes of tree myTreeIdxs[lvl] (listed in nodeList[treeId],
+ * nodeCount[treeId] entries) are cleared via dzeroSetLU, so later
+ * reductions can accumulate into clean storage.
+ *
+ * Arguments:
+ *   myTreeIdxs    per-level tree id assigned to this process
+ *   myZeroTrIdxs  per-level flag: nonzero => zero that tree's blocks
+ *   nodeCount     per-tree supernode counts
+ *   nodeList      per-tree supernode id lists
+ *   LUstruct      local L/U storage to be cleared
+ *   grid3d        3D process grid (zscp.Np gives z-dimension size)
+ * Returns 0.
+ */
+int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, dLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+	{
+	    if (myZeroTrIdxs[lvl])
+		{
+		    /* passive at this level: clear this tree's L/U blocks */
+		    int_t treeId = myTreeIdxs[lvl];
+		    dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+		}
+	}
+    
+    return 0;
+}
+
+
+/*
+ * dreduceAllAncestors3d: reduce (sum) the partial L/U updates of all
+ * ancestor levels above `ilvl` between the two z-processes paired at
+ * this level, timing the work into SCT->ancsReduce.
+ *
+ * Pairing rule: processes are paired by bit ilvl of their z-rank.  If
+ * myGrid is a multiple of 2^(ilvl+1) it acts as the receiver and its
+ * partner myGrid + 2^ilvl is the sender; otherwise this process is the
+ * sender and myGrid - 2^ilvl receives.  Note both members of the pair
+ * compute the same (sender, receiver) values.
+ *
+ * Arguments:
+ *   ilvl         current level in the z-process binary tree
+ *   myNodeCount  per-level supernode counts for this process's trees
+ *   treePerm     per-level supernode id lists (permuted order)
+ *   LUvsb        scratch buffers whose Lval_buf/Uval_buf receive panels
+ *   LUstruct     local L/U storage, accumulated into on the receiver
+ *   grid3d       3D process grid
+ *   SCT          statistics; ancsReduce accumulates elapsed seconds
+ * Returns 0.
+ */
+int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             dLUValSubBuf_t* LUvsb, dLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    double * Lval_buf  = LUvsb->Lval_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    int_t sender, receiver;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+	    sender = myGrid + (1 << ilvl);
+	    receiver = myGrid;
+	}
+    else
+	{
+	    sender = myGrid;
+	    receiver = myGrid - (1 << ilvl);
+	}
+    
+    /* Reduce every ancestor level above ilvl, one level at a time. */
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+	{
+	    /* supernodes of the ancestor tree at level alvl */
+	    // int_t atree = myTreeIdxs[alvl];
+	    int_t nsAncestor = myNodeCount[alvl];
+	    int_t* cAncestorList = treePerm[alvl];
+	    double treduce = SuperLU_timer_();
+	    dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+	    SCT->ancsReduce += SuperLU_timer_() - treduce;
+	    
+	}
+    return 0;
+}
+
<![CDATA[
+/*
+ * dgatherAllFactoredLU: after factorization, gather the factored L/U
+ * panels of every subtree back down the z-dimension so each process
+ * holds the factors it needs.  Walks the levels of the z-process binary
+ * tree; at each level ilvl where this process is active
+ * (!myZeroTrIdxs[ilvl]), the (sender, receiver) pair is chosen by bit
+ * ilvl of the z-rank — same pairing rule as dreduceAllAncestors3d.
+ *
+ * Inner loops: for each already-processed level alvl <= ilvl, the trees
+ * owned by `sender` at forest level blvl = maxLvl - alvl - 1 are
+ * transferred.  The forest arrays gNodeCount/gNodeLists index trees
+ * level by level, with level blvl apparently starting at offset
+ * 2^blvl - 1 and the sender's first tree at (sender >> alvl) within it
+ * (st = (1 << blvl) - 1 + (sender >> alvl)); numTrees = 2^(ilvl - alvl)
+ * consecutive trees are gathered.  NOTE(review): this layout assumption
+ * follows getNodeCountsFr/getNodeListFr — confirm there.
+ *
+ * Arguments:
+ *   trf3Dpartition  3D partition info: per-level flags, forests, buffers
+ *   LUstruct        local L/U factor storage, filled in on receivers
+ *   grid3d          3D process grid
+ *   SCT             statistics/timing counters
+ * Returns 0.
+ *
+ * NOTE(review): only the top-level arrays gNodeCount/gNodeLists are
+ * freed here; whether the per-tree lists inside gNodeLists are owned by
+ * sForests or leak is not visible from this file — verify in
+ * getNodeListFr.
+ */
+int_t dgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    dLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* pair sender/receiver by bit ilvl of the z-rank */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;
+			    int_t blvl = maxLvl - alvl - 1;
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    /* move tree tr's factored panels to the receiver */
+				    dgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* dgatherAllFactoredLU */
]]>
+
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
index 4b7e9589..36ee80b0 100644
--- a/SRC/pddistribute.c
+++ b/SRC/pddistribute.c
@@ -13,9 +13,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
*/ @@ -75,9 +76,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; + double *aij, **aij_send, *nzval, *dtemp = NULL; double *nzval_a; double asum,asum_tot; int iam, it, p, procs, iam_g; @@ -143,8 +144,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, ABORT("Malloc fails for ia[]."); if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); + ja = ia + k; } - ja = ia + k; /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { @@ -172,9 +173,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { - ia_send[p] = &index[i]; + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } @@ -218,7 +219,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, NOTE: Can possibly use MPI_Alltoallv. 
------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); @@ -229,7 +231,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; @@ -248,7 +251,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang + //if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } diff --git a/SRC/pdgsequ.c b/SRC/pdgsequ.c index 8702fbf5..95adf5b6 100644 --- a/SRC/pdgsequ.c +++ b/SRC/pdgsequ.c @@ -13,7 +13,7 @@ at the top-level directory. /*! @file * \brief Computes row and column scalings * - * File name: pdgsequ.c + * File name: pdgsequ.c * History: Modified from LAPACK routine DGEEQU */ #include @@ -84,7 +84,7 @@ at the top-level directory. void pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info, gridinfo_t *grid) + double *colcnd, double *amax, int_t *info, gridinfo_t *grid) { /* Local variables */ @@ -102,20 +102,22 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Test the input parameters. 
*/ *info = 0; if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - pxerr_dist("pdgsequ", grid, i); - return; + A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) + { + i = -(*info); + pxerr_dist("pdgsequ", grid, i); + return; } /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; + if ( A->nrow == 0 || A->ncol == 0 ) + { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; } Astore = A->Store; @@ -131,43 +133,49 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each row. */ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; } /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; - for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) + { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); } /* Get the global MAX and MIN for R */ tempmax = rcmax; tempmin = rcmin; MPI_Allreduce( &tempmax, &rcmax, - 1, MPI_DOUBLE, MPI_MAX, grid->comm); + 1, MPI_DOUBLE, MPI_MAX, grid->comm); MPI_Allreduce( &tempmin, &rcmin, - 1, MPI_DOUBLE, MPI_MIN, grid->comm); + 1, MPI_DOUBLE, MPI_MIN, grid->comm); *amax = rcmax; - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. 
*/ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) + { + *info = i + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* Compute column scale factors */ @@ -176,17 +184,19 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each column, assuming the row scalings computed above. */ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); - } - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; } /* Find the global maximum for c[j] */ if ( !(loc_max = doubleMalloc_dist(A->ncol))) - ABORT("Malloc fails for loc_max[]."); + ABORT("Malloc fails for loc_max[]."); for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); SUPERLU_FREE(loc_max); @@ -194,34 +204,39 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum and minimum scale factors. 
*/ rcmin = bignum; rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); + for (j = 0; j < A->ncol; ++j) + { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); } - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) + { + *info = A->nrow + j + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* gather R from each process to get the global R. 
*/ procs = grid->nprow * grid->npcol; if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for r_sizes[]."); + ABORT("Malloc fails for r_sizes[]."); displs = r_sizes + procs; if ( !(loc_r = doubleMalloc_dist(m_loc))) - ABORT("Malloc fails for loc_r[]."); + ABORT("Malloc fails for loc_r[]."); j = Astore->fst_row; for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; @@ -230,15 +245,14 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Set up the displacements for allgatherv */ displs[0] = 0; - for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + for (i = 1; i < procs; ++i) displs[i] = displs[i - 1] + r_sizes[i - 1]; /* Now gather the actual data */ MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, - MPI_DOUBLE, grid->comm); + MPI_DOUBLE, grid->comm); SUPERLU_FREE(r_sizes); SUPERLU_FREE(loc_r); - return; } /* pdgsequ */ diff --git a/SRC/pdgsmv.c b/SRC/pdgsmv.c index db767647..1b2882f4 100644 --- a/SRC/pdgsmv.c +++ b/SRC/pdgsmv.c @@ -373,11 +373,11 @@ void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm) int_t *it; double *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index d42898e8..d0160c52 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -326,7 +326,7 @@ at the top-level directory. * = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows * of the original matrix to make the diagonal large * relative to the off-diagonal. 
- * = LargeDiag_APWM: use the parallel approximate-weight perfect + * = LargeDiag_HPWM: use the parallel approximate-weight perfect * matching to permute rows of the original matrix * to make the diagonal large relative to the * off-diagonal. @@ -408,7 +408,7 @@ at the top-level directory. * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * - * o R (double*) dimension (A->nrow) + * o R (double *) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). @@ -416,7 +416,7 @@ at the top-level directory. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * - * o C (double*) dimension (A->ncol) + * o C (double *) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). @@ -492,7 +492,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -590,13 +590,13 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Test the input parameters. 
*/ *info = 0; Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) + if ( Fact < DOFACT || Fact > FACTORED ) *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) *info = -1; else if ( options->IterRefine == SLU_EXTRA ) { *info = -1; @@ -667,6 +667,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -914,7 +915,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif - } else { /* use largeDiag_AWPM */ + } else { /* use LargeDiag_HWPM */ #ifdef HAVE_COMBBLAS d_c2cpp_GetHWPM(A, grid, ScalePermstruct); #else @@ -1061,7 +1062,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1083,10 +1084,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1226,11 +1227,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); fflush(stdout); #endif @@ -1321,7 +1317,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ @@ -1585,6 +1582,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c new file mode 100644 index 00000000..1a74194b --- /dev/null +++ b/SRC/pdgssvx3d.c @@ -0,0 +1,1589 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinment is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + dScalePermstruct_t * ScalePermstruct, + double B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + dLUstruct_t * LUstruct, dSOLVEstruct_t * SOLVEstruct, + double *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + double *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + double *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 'd'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_D || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("pdgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + dGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (double *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter pdgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran 
= (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. 
*/ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (double *) GAstore->nzval; + GA_mem_use += nnz * sizeof (double); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + d_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + dp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pdgstrf3d (options, 
m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + dDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 
1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = doubleMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit pdgssvx3d()"); +#endif + +} diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c index f988bf94..14cf4f2d 100644 --- a/SRC/pdgssvx_ABglobal.c +++ b/SRC/pdgssvx_ABglobal.c @@ -588,6 +588,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -876,7 +877,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -899,7 +900,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1098,6 +1099,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index d15d31f9..9b2d056e 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -109,7 +109,6 @@ at the top-level directory. */ #include -/*#include "mkl.h"*/ #include "superlu_ddefs.h" #include "gpublas_utils.h" #ifdef GPU_ACC @@ -128,7 +127,7 @@ at the top-level directory. 
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -769,11 +768,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int gpublas_nb = get_gpublas_nb(); // default 64 - int nstreams = get_num_gpu_streams (); + int nstreams = get_num_gpu_streams (); // default 8 - int buffer_size = SUPERLU_MAX(max_row_size*nstreams*gpublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); /* array holding last column blk for each partition, used in SchCompUdt--GPU.c */ #if 0 @@ -785,8 +785,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -813,8 +814,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. 
BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -829,16 +834,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(double) ,gpuHostMallocDefault)) ) ABORT("Malloc fails for dgemm buffer V"); - if ( iam==0 )DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d GPU Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d GPU Streams \n",nstreams ); + fflush(stdout); + } #endif gpublasHandle_t *handle; @@ -881,10 +889,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -892,7 +901,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU)\n", iam, bigv_size); fflush(stdout); #endif @@ -906,7 +915,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for dgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1759,29 +1769,29 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. 
Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index 168fa750..8c5a1933 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -213,9 +213,8 @@ pdgstrf2_trsm for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; - /* Not to replace zero pivot. */ - // if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0 ) { - if (options->ReplaceTinyPivot == YES) { + /* May replace zero pivot. */ + if (options->ReplaceTinyPivot == YES ) { if (fabs (lusup[i]) < thresh) { /* Diagonal */ #if ( PRNTlevel>=2 ) @@ -363,44 +362,35 @@ pdgstrf2_trsm } /* PDGSTRF2_trsm */ -#if 0 /* COMMENT OUT 3D CODE FOR NOW */ /***************************************************************************** * The following functions are for the new pdgstrf2_dtrsm in the 3D code. 
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, - double* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, double* ublk_ptr, int ld_ujrow, + double* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; double alpha = 1.0; - unsigned long long t1 = _rdtsc(); + double t1 = SuperLU_timer_(); #define GT 32 +#ifdef _OPENMP #pragma omp parallel for +#endif for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); -#if 1 - #if defined (USE_VENDOR_BLAS) - dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr, - 1, 1, 1, 1); - #else - dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr); - #endif -#else - cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); -#endif + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ - t1 = _rdtsc() - t1; + t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; @@ -408,13 +398,30 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, #pragma GCC push_options #pragma GCC optimize ("O0") -/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    double *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, dLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     double alpha = -1, zero = 0.0;
 
@@ -424,8 +431,8 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     int_t jfst = FstBlockC (k);
     int_t jlst = FstBlockC (k + 1);
     double *lusup = Llu->Lnzval_bc_ptr[lk];
-    int_t nsupc = SuperSize (k);
-    int_t nsupr;
+    int nsupc = SuperSize (k);
+    int nsupr;
     if (Llu->Lrowind_bc_ptr[lk])
         nsupr = Llu->Lrowind_bc_ptr[lk][1];
     else
@@ -433,18 +440,19 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     double *ublk_ptr = BlockUFactor;
     double *ujrow = BlockUFactor;
+    int_t luptr = 0;                  /* Points to the diagonal entries. */
-    int_t cols_left = nsupc;          /* supernode size */
+    int cols_left = nsupc;          /* supernode size */
     int_t u_diag_cnt = 0;
     int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
-    int_t incx = 1;
-    int_t incy = ld_ujrow;
+    int incx = 1;
+    int incy = ld_ujrow;
 
     for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
     {
         /* Diagonal pivot */
         int_t i = luptr;
-        /* Not to replace zero pivot.  */
-        if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        /* May replace a zero pivot.  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
         {
             if (fabs (lusup[i]) < thresh) {  /* Diagonal */
 
@@ -485,17 +493,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
         if (--cols_left)
         {
             /*following must be int*/
-            int_t l = nsupc - j - 1;
+            int l = nsupc - j - 1;
 
 	    /* Rank-1 update */
-#if 1
-	    dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-		   &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], &nsupr);
-#else
-            cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx,
-                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
-                        nsupr);
-#endif
+            superlu_dger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
             stat->ops[FACT] += 2 * l * cols_left;
         }
 
@@ -506,8 +508,8 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
 
     //int_t thread_id = omp_get_thread_num();
-    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
-}
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Dgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
@@ -712,7 +714,7 @@ int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 
 int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 			      int_t *usub, double *uval, double *tempv,
-			      int_t knsupc, int_t nsupr, double *lusup,
+			      int_t knsupc, int nsupr, double *lusup,
+			      Glu_persist_t *Glu_persist)    /* Glu_persist supplies xsup, used by SuperSize */
 {
     double alpha = 1.0;
@@ -725,34 +727,22 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 
     // printf("klst inside task%d\n", );
     /*find ldu */
-    int_t ldu = 0;
+    int ldu = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
         ldu = SUPERLU_MAX( klst - usub[jj], ldu) ;
     }
 
     /*pack U block into a dense Block*/
-    int_t ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
+    int ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
     	                           uval, tempv);
 
     /*now call dtrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu,
-	     1, 1, 1, 1);
-  #else
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu);
-  #endif
-#else
-
-    cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
-                 ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
-#endif
+    
+    superlu_dtrsm("L", "L", "N", "U", ldu, ncols, alpha,
+		  &lusup[luptr], nsupr, tempv, ldu);
 
     /*now scatter the output into sparse U block*/
     dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
@@ -760,10 +750,9 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 }
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
-#if 1 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -845,8 +834,10 @@ void pdgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -889,7 +880,9 @@ void pdgstrs2_omp
 #endif
 	    } /* end if segsize > 0 */
 	} /* end for j in parallel ... */
+#ifdef _OPENMP    
 /* #pragma omp taskwait */
+#endif
     }  /* end for b ... */
 
 #ifndef USE_Ublock_info
@@ -912,7 +905,7 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, dLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -933,20 +926,22 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         double *tempv = bigV +  thread_id * ldt * ldt;
         dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
-#endif
+#endif /* there are 2 versions of pdgstrs2_omp */
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
new file mode 100644
index 00000000..27fe6bfe
--- /dev/null
+++ b/SRC/pdgstrf3d.c
@@ -0,0 +1,392 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Performs LU factorization in 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "dlustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSTRF3D performs the LU factorization in parallel using 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) dLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + dLocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* Test the input parameters. */ + *info = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()"); +#endif + + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + +#if 0 // sherry: not used + ddiagFactBufs_t dFBuf; + dinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); +#endif + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + dscuBufs_t scuBufs; + dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = 
trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + dLUValSubBuf_t** LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + ddiagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + /* Setting up GPU related data structures */ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + + dInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); + HyP->bigu_size = bigu_size; + int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size (); + HyP->buffer_size = buffer_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + dLUstruct_gpu_t *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + dsluGPU_t sluGPUobj; + dsluGPU_t *sluGPU = &sluGPUobj; + 
sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + + /*Initilize the lookup tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + + /* Initialize GPU data structures */ + dinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nGPUStreams = sluGPU->nGPUStreams; + } + +#endif // end GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + dsparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, 
&factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + dsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + dreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */ + +#ifdef GPU_ACC + /* This frees the GPU storage allocateed in initSluGPU3D_t() */ + if (superlu_acc_offload) { + dfree_LUstruct_gpu (sluGPU->A_gpu); + } +#endif + + /* Prepare error message - find the smallesr index i that U(i,i)==0 */ + int iinfo; + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + //printf("After factorization: INFO = %d\n", *info); fflush(stdout); + + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + dfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + 
freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + dLluBufFreeArr(numLA, LUvsbs); + dfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()"); +#endif + return 0; + +} /* pdgstrf3d */ diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c index dc23d0cf..be43fbc1 100644 --- a/SRC/pdgstrs.c +++ b/SRC/pdgstrs.c @@ -1158,7 +1158,13 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct, int_t cnt1,cnt2; -#ifdef GPU_ACC +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ + +#if ( PRNTlevel>=1 ) + if ( !iam) printf(".. GPU trisolve\n"); + fflush(stdout); +#endif + #ifdef GPUREF @@ -1546,8 +1552,7 @@ if(procs==1){ // fflush(stdout); // } - -#ifdef GPU_ACC /* CPU trisolve*/ +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ // #if 0 /* CPU trisolve*/ #ifdef GPUREF /* use cuSparse*/ @@ -2632,7 +2637,7 @@ thread_id=0; -#ifdef GPU_ACC /*GPU trisolve*/ +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ // #if 0 /* CPU trisolve*/ d_grid = NULL; diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c index 6c9af91f..9aa84033 100644 --- a/SRC/pdgstrs_lsum.c +++ b/SRC/pdgstrs_lsum.c @@ -14,14 +14,15 @@ at the top-level directory. * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * 
*/ @@ -528,11 +529,11 @@ void dlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -598,114 +599,113 @@ void dlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + // fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif @@ -716,30 +716,30 @@ void dlsum_fmod_inv #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block 
modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -781,16 +781,16 @@ void dlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; - } + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ // TOC(t3, t1); @@ -801,94 +801,91 @@ void dlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. + */ - if(LBtree_ptr[lk].empty_==NO){ + if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); - } - - /* - * Perform local block modifications. 
- */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. + */ - { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -950,7 +946,6 @@ void dlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* dLSUM_FMOD_INV */ @@ -1134,22 +1129,23 @@ void dlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... 
*/ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; @@ -1341,13 +1333,12 @@ void dlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - dlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1359,8 +1350,8 @@ void dlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* dLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end dlsum_fmod_inv_master */ @@ -1420,7 +1411,7 @@ void dlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = 1;//ceil(CACHELINE/(double)dword); aln_i = 1;//ceil(CACHELINE/(double)iword); @@ -1484,15 +1475,15 @@ void dlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... 
*/ } #if ( PROFlevel>=1 ) @@ -1500,7 +1491,6 @@ void dlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1512,9 +1502,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 2 * (iklrow - fnz); + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); } } /* for jj ... */ } @@ -1694,9 +1683,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* dlSUM_BMOD_inv */ @@ -1951,9 +1940,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. 
*/ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -1993,9 +1982,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -2029,9 +2018,9 @@ void dlsum_bmod_inv_master if ( iam != p ) { for (ii=1;iigstrs_comm); - - if ( options->RefineInitialized ) { - pdgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + pdgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; } /* dSolveFinalize */ +void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + 
/* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* dDestroy_A3d_gathered_on_2d */ + + /*! \brief Check the inf-norm of the error vector */ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, - double xtrue[], int_t ldxtrue, gridinfo_t *grid) + double xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; double *x_work, *xtrue_work; @@ -911,8 +937,8 @@ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); diff --git a/SRC/ps3dcomm.c b/SRC/ps3dcomm.c new file mode 100644 index 00000000..2c113562 --- /dev/null +++ b/SRC/ps3dcomm.c @@ -0,0 +1,876 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Communication routines for the 3D algorithm. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(float), MPI_INFO_NULL, &(a)))
+
+/*! \brief Allocate the per-process panel pointer arrays and send/recv
+ *  schedules inside LUstruct->Llu.
+ *
+ *  Creates nbc = ceil(nsupers/Pc) column-panel slots (Lrowind_bc_ptr /
+ *  Lnzval_bc_ptr) and nbr = ceil(nsupers/Pr) row-panel slots
+ *  (Ufstnz_br_ptr / Unzval_br_ptr), all initialized to NULL; the actual
+ *  index/value storage is filled in later when the panels are scattered.
+ *  Also allocates and zeroes the ToRecv / ToSendD / ToSendR arrays.
+ *  NOTE(review): Pc/Pr are read from grid3d->npcol/nprow directly —
+ *  presumably these mirror the 2D grid dimensions; confirm against
+ *  gridinfo3d_t.  SUPERLU_MALLOC results are not checked here (matches
+ *  the surrounding codebase style).
+ *  Returns 0.
+ */
+int_t sAllocLlu(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t   **Lrowind_bc_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
+    float  **Lnzval_bc_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (i = 0; i < nbc ; ++i)
+	{
+	    /* code */
+	    Lrowind_bc_ptr[i] = NULL;
+	    Lnzval_bc_ptr[i] = NULL;
+	}
+    
+    int_t   **Ufstnz_br_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
+    float  **Unzval_br_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbr); /* size ceil(NSUPERS/Pr) */
+    
+    for (i = 0; i < nbr ; ++i)
+	{
+	    /* code */
+	    Ufstnz_br_ptr[i] = NULL;
+	    Unzval_br_ptr[i] = NULL;
+	}
+
+   // Sherry: use int type
+                  /* Recv from no one (0), left (1), and up (2).*/
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+                  /* Whether need to send down block row. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
+                  /* List of processes to send right block col. */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+
+    /* One length-Pc flag row per local block column. */
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    //ToSendR[i] = INT_T_ALLOC(Pc);
+	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+	}
+    
+    /*now setup the pointers*/
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
+    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
+    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
+    Llu->Unzval_br_ptr = Unzval_br_ptr ;
+    Llu->ToRecv = ToRecv ;
+    Llu->ToSendD = ToSendD ;
+    Llu->ToSendR = ToSendR ;
+    
+    return 0;
+} /* sAllocLlu */
+
+/*! \brief Migrate every nonempty local L and U panel into MPI-allocated
+ *  memory.
+ *
+ *  For each nonempty U row panel and (owned) L column panel, a new buffer
+ *  is obtained via MPI_INT_ALLOC / MPI_DATATYPE_ALLOC (wrappers around
+ *  MPI_Alloc_mem), the index and value data are memcpy'd over, the panel
+ *  pointer is redirected to the new buffer, and only then is the old
+ *  SUPERLU_MALLOC'd storage freed — the copy-before-free order is what
+ *  keeps the panel valid throughout.
+ *  Panel sizes: U uses lenv = usub[1] values and lens = usub[2] index
+ *  entries; L uses len1 = len + BC_HEADER + nrbl*LB_DESCRIPTOR index
+ *  entries and len2 = SuperSize(jb)*len values.
+ *  NOTE(review): xsup is fetched but never used in this routine.
+ *  Returns 0.
+ */
+int_t smpiMallocLUStruct(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    float * uval = Unzval_br_ptr[lb];
+	    float * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    /* Copy into MPI memory first, then release the old buffers. */
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(float));
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)   /* only the owning process column has this panel */
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    float *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(float));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+/*! \brief Send the numerical values of local L panel k to another grid
+ *  layer.
+ *
+ *  If this process is in the owning 2D process column (mycol == PCOL(k))
+ *  and the panel is nonempty, blocking-send len2 = SuperSize(k)*lsub[1]
+ *  floats to rank `receiver` on the z-dimension communicator
+ *  grid3d->zscp.comm, using the supernode number k as the message tag.
+ *  The sent byte count is accumulated into SCT->commVolRed.
+ *  Returns 0 (also when this rank owns nothing to send).
+ */
+int_t szSendLPanel(int_t k, int_t receiver,
+                   sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int_t len   = lsub[1];       /* LDA of the nzval[] */
+		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
+		    
+		    MPI_Send(lnzval, len2, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += len2 * sizeof(float);
+		}
+	}
+    return 0;
+}
+
+
+/*! \brief Receive L panel k from another grid layer and reduce it into
+ *  the local copy.
+ *
+ *  If this process is in the owning 2D process column and the panel is
+ *  nonempty, receive len2 = SuperSize(k)*lsub[1] floats from `sender`
+ *  (tag k, communicator grid3d->zscp.comm) into Lval_buf, then update
+ *  in place:  lnzval = alpha*lnzval + beta*Lval_buf, via superlu_sscal
+ *  followed by superlu_saxpy.
+ *  NOTE(review): the local `inc` is unused — the BLAS-like calls pass
+ *  the literal stride 1 directly.
+ *  Returns 0.
+ */
+int_t szRecvLPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Lval_buf,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int len   = lsub[1];       /* LDA of the nzval[] */
+		    int len2  = SuperSize(k) * len; /* size of nzval of L panels */
+		    
+		    MPI_Status status;
+		    MPI_Recv(Lval_buf , len2, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(len2, alpha, lnzval, 1);
+		    superlu_saxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+		}
+	}
+
+    return 0;
+}
+
+/*! \brief Send the numerical values of local U panel k to another grid
+ *  layer.
+ *
+ *  Row-panel counterpart of szSendLPanel: if this process is in the
+ *  owning 2D process row (myrow == PROW(k)) and the panel is nonempty,
+ *  blocking-send lenv = usub[1] floats to rank `receiver` on
+ *  grid3d->zscp.comm with tag k, and accumulate the byte count into
+ *  SCT->commVolRed.
+ *  Returns 0.
+ */
+int_t szSendUPanel(int_t k, int_t receiver,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];  /* number of stored U values in this panel */
+		    
+		    /* code */
+		    MPI_Send(unzval, lenv, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += lenv * sizeof(float);
+		}
+	}
+	
+    return 0;
+}
+
+
+/*! \brief Receive U panel k from another grid layer and reduce it into
+ *  the local copy.
+ *
+ *  Row-panel counterpart of szRecvLPanel: if this process is in the
+ *  owning 2D process row and the panel is nonempty, receive lenv =
+ *  usub[1] floats from `sender` (tag k, grid3d->zscp.comm) into
+ *  Uval_buf, then update in place:
+ *      unzval = alpha*unzval + beta*Uval_buf
+ *  via superlu_sscal followed by superlu_saxpy.
+ *  NOTE(review): the local `inc` is unused — stride 1 is passed
+ *  literally to the BLAS-like helpers.
+ *  Returns 0.
+ */
+int_t szRecvUPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Uval_buf, sLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    MPI_Status status;
+		    MPI_Recv(Uval_buf , lenv, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(lenv, alpha, unzval, 1);
+		    superlu_saxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+		}
+	}
+    return 0;
+}
+
+
+int_t sp3dScatter(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    /* Only layer 0 can compute nsupers locally; other layers obtain it
+       through the broadcast below. */
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	sAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	sAllocLlu(nsupers, LUstruct, grid3d);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    sscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    sscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    /* Communication buffer sizes computed on layer 0. */
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now sending tosendR etc */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);  /* number of local block rows    */
+    int_t nbc = CEILING(nsupers, Pc);  /* number of local block columns */
+    //    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    /* ToSendR is an array of per-block-column row vectors, so each one is
+       broadcast separately. */
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+	mpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* sp3dScatter */
+
+
+/* Broadcast every non-empty local U panel (index array usub and values
+ * uval) from z-layer 0 to all other layers.  Per panel the protocol is:
+ * flag (non-empty?) -> sizes -> usub -> uval; non-root layers allocate
+ * before each payload broadcast and install the new pointers. */
+int_t sscatter3dUPanels(int_t nsupers,
+		       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	float * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    /* Sizes are read from usub on layer 0 only; other layers get
+	       them via the two broadcasts below. */
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];   /* number of nzval entries */
+		    lens = usub[2];   /* length of the index array */
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = floatMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, MPI_FLOAT, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end sScatter3dUPanels */
+
+
+/* Broadcast every non-empty local L panel (index array lsub and values
+ * lnzval) from z-layer 0 to all other layers.  Per block column owned by
+ * this process column the protocol is: flag -> len1 -> lsub -> len2 ->
+ * lnzval; non-root layers allocate before each payload broadcast. */
+int_t sscatter3dLPanels(int_t nsupers,
+                       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		/* Sizes known on layer 0 only; len1/len2 reach the other
+		   layers via the broadcasts below. */
+		int_t nrbl, len, len1, len2;
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /* NOTE(review): the deeper indentation from here on is
+		       misleading — only the single allocation statement above
+		       is guarded by the `if (grid3d->zscp.Iam)`; everything
+		       below executes on all layers. */
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = floatCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, MPI_FLOAT, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* sscatter3dLPanels */
+
+/* Collect the L panels of z-process layer `layer` onto layer 0: for each
+ * non-empty local L panel, the owning layer sends its nzval[] and layer 0
+ * receives it in place (overwriting its own copy), matched by tag jb.
+ *
+ * Bug fix: the receive used MPI_DOUBLE while the matching send uses
+ * MPI_FLOAT; in this single-precision (s) routine both sides must use
+ * MPI_FLOAT, otherwise the type signatures / byte counts mismatch. */
+int_t scollect3dLpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		    
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];       /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
+			    
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, MPI_FLOAT, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    /* was MPI_DOUBLE — must match the MPI_FLOAT send above */
+		    MPI_Recv(lnzval, len2, MPI_FLOAT, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+/* Collect the U panels of z-process layer `layer` onto layer 0: for each
+ * non-empty local U panel, the owning layer sends its nzval[] and layer 0
+ * receives it in place (overwriting its own copy), matched by tag lb. */
+int_t scollect3dUpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+    {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	float * uval = Unzval_br_ptr[lb];
+	    
+	if (usub)  /* panel is non-empty on this process */
+	{
+	    /* code */
+	    int lenv = usub[1];  /* count of nzval entries */
+	    if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(uval, lenv, MPI_FLOAT, 0, lb, grid3d->zscp.comm);
+		}
+		    
+	    if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(uval, lenv, MPI_FLOAT, layer, lb, grid3d->zscp.comm, &status);
+		}
+	}
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t sp3dCollect(int_t layer, int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Bring both the L and the U panels of z-layer `layer` onto layer 0. */
+{
+    /* Total number of supernodes in the factorization. */
+    int_t nSupers = getNsupers(n, LUstruct->Glu_persist);
+
+    scollect3dLpanels(layer, nSupers, LUstruct, grid3d);
+    scollect3dUpanels(layer, nSupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+/* Zero out the numerical values (nzval) of the L and U panels owned by
+ * this 2D process for every supernode listed in nodeList[0..nnodes-1].
+ * Index structures are left untouched; only the float payloads are
+ * memset to zero. */
+int_t szeroSetLU(int_t nnodes, int_t* nodeList, sLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Pass 1: zero the L panels of block columns owned by my process column. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t jb = nodeList[node];
+	if (mycol != PCOL( jb, grid )) continue;
+
+	int_t ljb = LBj( jb, grid );           /* local block-column index */
+	int_t *colIndex = Lrowind_bc_ptr[ljb];
+	float *colVals  = Lnzval_bc_ptr[ljb];
+
+	if (colIndex != NULL)
+	{
+	    int_t lda    = colIndex[1];            /* LDA of the nzval[] */
+	    int_t nVals  = SuperSize(jb) * lda;    /* size of nzval of L panel */
+	    memset( colVals, 0, nVals * sizeof(float) );
+	}
+    }
+
+    /* Pass 2: zero the U panels of block rows owned by my process row. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t ib = nodeList[node];
+	if (myrow != PROW( ib, grid )) continue;
+
+	int_t lib = LBi( ib, grid );           /* local block-row index */
+	int_t *rowIndex = Ufstnz_br_ptr[lib];
+	float *rowVals  = Unzval_br_ptr[lib];
+
+	if (rowIndex != NULL)
+	{
+	    int nVals = rowIndex[1];           /* count of nzval entries */
+	    memset( rowVals, 0, nVals * sizeof(float) );
+	}
+    }
+
+    return 0;
+}
+
+
+/* Pairwise reduction of ancestor panels along the z dimension: for every
+ * supernode in nodeList[0..nnodes-1], the `sender` layer ships its L and U
+ * panels and the partner accumulates them into its own copies with
+ * alpha = beta = 1 (i.e. local += incoming). */
+int_t sreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        float* Lval_buf, float* Uval_buf,
+                        sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 1.0, beta = 1.0;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t jb = nodeList[node];
+
+	if (myGrid == sender)
+	{
+	    /* Ship my copies of the panels of supernode jb. */
+	    szSendLPanel(jb, receiver, LUstruct, grid3d, SCT);
+	    szSendUPanel(jb, receiver, LUstruct, grid3d, SCT);
+	}
+	else
+	{
+	    /* Receive into the scratch buffers and accumulate locally. */
+	    szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+	                 LUstruct, grid3d, SCT);
+	    szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+	                 LUstruct, grid3d, SCT);
+	}
+    }
+    return 0;
+}
+
+
+/* Gather factored L and U panels for the supernodes in nodeList from the
+ * `sender` layer to its partner: with alpha = 0, beta = 1 the receiver's
+ * panel is overwritten by the incoming data (local := 0*local + 1*buf),
+ * unlike sreduceAncestors3d which accumulates. */
+int_t sgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        sLUValSubBuf_t* LUvsb,
+                        sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 0.0, beta = 1.0;	
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    if (myGrid == sender)
+		{
+		    szSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    szSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    
+		}
+	    else
+		{
+		    szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                     LUstruct, grid3d, SCT);
+		    szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                     LUstruct, grid3d, SCT);
+		}
+	}
+    return 0;
+    
+}
+
+
+/* Initialize the 3D LU structure on this process: for every level of the
+ * elimination-tree hierarchy where this process holds a zero-tree
+ * (myZeroTrIdxs[lvl] set), zero out the LU panels of that tree's nodes. */
+int_t sinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, sLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    /* Number of levels in the z-layer hierarchy. */
+    int_t numLevels = log2i(grid3d->zscp.Np) + 1;
+
+    for (int_t lvl = 0; lvl < numLevels; lvl++)
+    {
+	if (!myZeroTrIdxs[lvl]) continue;  /* nothing to zero at this level */
+
+	int_t treeId = myTreeIdxs[lvl];
+	szeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+    }
+
+    return 0;
+}
+
+
+/* At hierarchy level ilvl, pair up z-layers at distance 2^ilvl (the layer
+ * whose index is a multiple of 2^(ilvl+1) is the receiver, its partner the
+ * sender) and reduce every ancestor level above ilvl into the receiver.
+ * Time spent is accumulated into SCT->ancsReduce. */
+int sreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             sLUValSubBuf_t* LUvsb, sLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    /* Determine my role in the pair at this level. */
+    int_t sender, receiver;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+	    sender = myGrid + (1 << ilvl);  /* partner layer sends to me */
+	    receiver = myGrid;
+	}
+    else
+	{
+	    sender = myGrid;                /* I send to my partner */
+	    receiver = myGrid - (1 << ilvl);
+	}
+    
+    /*Reduce all the ancestors*/
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+	{
+	    /* code */
+	    // int_t atree = myTreeIdxs[alvl];
+	    int_t nsAncestor = myNodeCount[alvl];   /* nodes at this ancestor level */
+	    int_t* cAncestorList = treePerm[alvl];  /* their supernode indices */
+	    double treduce = SuperLU_timer_();
+	    sreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+	    SCT->ancsReduce += SuperLU_timer_() - treduce;
+	    
+	}
+    return 0;
+}
+
+/* Gather all factored LU panels across the z-layer hierarchy: at each
+ * level ilvl (on which this process participates, i.e. its tree is not a
+ * zero-tree), layers at distance 2^ilvl pair up and the sender's panels
+ * for every subtree it factored are copied to the receiver via
+ * sgatherFactoredLU.  Temporary per-tree node counts/lists are freed
+ * before returning. */
+int_t sgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    sLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* code */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    /* Same pairing rule as sreduceAllAncestors3d: layers at
+		       distance 2^ilvl exchange, the 2^(ilvl+1)-aligned layer
+		       is the receiver. */
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;  /* subtrees handled at this depth */
+			    int_t blvl = maxLvl - alvl - 1;
+			    /* st: index of the sender's first tree at depth blvl;
+			       NOTE(review): assumes heap-style (level-order)
+			       numbering of the forest — confirm against the
+			       sForest construction. */
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    /* code */
+				    sgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* sgatherAllFactoredLU */
+
diff --git a/SRC/psGetDiagU.c b/SRC/psGetDiagU.c
new file mode 100644
index 00000000..9303baa9
--- /dev/null
+++ b/SRC/psGetDiagU.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file psGetDiagU.c
+ * \brief Extracts the main diagonal of matrix U
+ *
+ * 
+ * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Xiaoye S. Li
+ * Created:  April 16, 2002
+ * Modified: May 15, 2016
+ * 
+ */ + + + +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
+ *
+ * Arguments
+ * =========
+ *
+ * n        (input) int
+ *          Dimension of the matrix.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *          The data structures to store the distributed L and U factors.
+ *          see superlu_sdefs.h for its definition.
+ *
+ * grid     (input) gridinfo_t*
+ *          The 2D process mesh. It contains the MPI communicator, the number
+ *          of process rows (NPROW), the number of process columns (NPCOL),
+ *          and my process rank. It is an input argument to all the
+ *          parallel routines.
+ *
+ * diagU    (output) float*, dimension (n)
+ *          The main diagonal of matrix U.
+ *          On exit, it is available on all processes.
+ *
+ *
+ * Note
+ * ====
+ *
+ * The diagonal blocks of the L and U matrices are stored in the L
+ * data structures, and are on the diagonal processes of the
+ * 2D process grid.
+ *
+ * This routine is modified from gather_diag_to_all() in psgstrs_Bglobal.c.
+ * 
+ */ +void psGetDiagU(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *diagU) +{ + + int_t *xsup; + int iam, knsupc, pkk; + int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int_t i, j, jj, k, lk, lwork, nsupers, p; + int_t num_diag_procs, *diag_procs, *diag_len; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float *sblock, *swork, *lusup; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); + if ( !(swork = floatMalloc_dist(jj)) ) ABORT("Malloc fails for swork[]"); + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy diagonal into buffer dwork[]. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBj( k, grid ); + nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ + lusup = Llu->Lnzval_bc_ptr[lk]; + for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */ + swork[lwork+i] = lusup[i*(nsupr+1)]; + lwork += knsupc; + } + MPI_Bcast( swork, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( swork, diag_len[p], MPI_FLOAT, pkk, grid->comm ); + } + + /* Scatter swork[] into global diagU vector. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + sblock = &diagU[FstBlockC( k )]; + for (i = 0; i < knsupc; ++i) sblock[i] = swork[lwork+i]; + lwork += knsupc; + } + } /* for p = ... */ + + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + SUPERLU_FREE(swork); +} diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c new file mode 100644 index 00000000..30c114cb --- /dev/null +++ b/SRC/psdistribute.c @@ -0,0 +1,1988 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Re-distribute A on the 2D process mesh. + *
+ * -- Distributed SuperLU routine (version 7.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
+ * 
+ */ + +#include "superlu_sdefs.h" + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * colptr (output) int*
+ *
+ * rowind (output) int*
+ *
+ * a      (output) float*
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * 
+ */ +int_t +sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, + gridinfo_t *grid, int_t *colptr[], int_t *rowind[], + float *a[]) +{ + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize,nnz_tot; + int_t nnz_loc; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; + int_t *ptr_to_send; + float *aij, **aij_send, *nzval, *dtemp = NULL; + float *nzval_a; + float asum,asum_tot; + int iam, it, p, procs, iam_g; + MPI_Request *send_req; + MPI_Status status; + + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sReDistribute_A()"); +#endif + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nnzToRecv = intCalloc_dist(2*procs); + nnzToSend = nnzToRecv + procs; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. 
+ ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); + + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + + /* Allocate space for storing the triplets after redistribution. */ + if ( k ) { /* count can be zero. */ + if ( !(ia = intMalloc_dist(2*k)) ) + ABORT("Malloc fails for ia[]."); + if ( !(aij = floatMalloc_dist(k)) ) + ABORT("Malloc fails for aij[]."); + ja = ia + k; + } + + /* Allocate temporary storage for sending/receiving the A triplets. 
*/ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) + ABORT("Malloc fails for ia_send[]."); + if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) ) + ABORT("Malloc fails for aij_send[]."); + if ( SendCnt ) { /* count can be zero */ + if ( !(index = intMalloc_dist(2*SendCnt)) ) + ABORT("Malloc fails for index[]."); + if ( !(nzval = floatMalloc_dist(SendCnt)) ) + ABORT("Malloc fails for nzval[]."); + } + if ( !(ptr_to_send = intCalloc_dist(procs)) ) + ABORT("Malloc fails for ptr_to_send[]."); + if ( maxnnzToRecv ) { /* count can be zero */ + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) + ABORT("Malloc fails for itemp[]."); + if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) ) + ABORT("Malloc fails for dtemp[]."); + } + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; + j += nnzToSend[p]; + } + } + } /* if procs > 1 */ + + if ( !(*colptr = intCalloc_dist(n+1)) ) + ABORT("Malloc fails for *colptr[]."); + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. 
*/ + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + ++(*colptr)[jcol]; /* Count nonzeros in each column */ + } + } + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. + ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_FLOAT, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /*assert(jcol 0 ) { // cause two of the tests to hang + //if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + + if ( procs > 1 ) { + 
SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + if ( SendCnt ) { + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + } + SUPERLU_FREE(ptr_to_send); + if ( maxnnzToRecv ) { + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + } + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. + ------------------------------------------------------------*/ + if ( nnz_loc ) { /* nnz_loc can be zero */ + if ( !(*rowind = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *rowind[]."); + if ( !(*a = floatMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *a[]."); + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = (*colptr)[0]; + (*colptr)[0] = 0; + for (j = 1; j < n; ++j) { + k += jsize; + jsize = (*colptr)[j]; + (*colptr)[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + j = ja[i]; + k = (*colptr)[j]; + (*rowind)[k] = ia[i]; + (*a)[k] = aij[i]; + ++(*colptr)[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; + (*colptr)[0] = 0; + + if ( nnz_loc ) { + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sReDistribute_A()"); +#endif + + return 0; +} /* sReDistribute_A */ + +float +psdistribute(fact_t fact, int_t n, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, sLUstruct_t *LUstruct, + gridinfo_t *grid) +/* + * -- Distributed SuperLU routine (version 2.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * March 15, 2003 + * + * + * Purpose + * ======= + * Distribute the matrix onto the 2D process mesh. + * + * Arguments + * ========= + * + * fact (input) fact_t + * Specifies whether or not the L and U structures will be re-used. + * = SamePattern_SameRowPerm: L and U structures are input, and + * unchanged on exit. 
+ * = DOFACT or SamePattern: L and U structures are computed and output. + * + * n (input) int + * Dimension of the matrix. + * + * A (input) SuperMatrix* + * The distributed input matrix A of dimension (A->nrow, A->ncol). + * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: + * Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. + * + * ScalePermstruct (input) sScalePermstruct_t* + * The data structure to store the scaling and permutation vectors + * describing the transformations performed to the original matrix A. + * + * Glu_freeable (input) *Glu_freeable_t + * The global structure describing the graph of L and U. + * + * LUstruct (input) sLUstruct_t* + * Data structures for L and U factors. + * + * grid (input) gridinfo_t* + * The 2D process mesh. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + float *a; + int_t *asub, *xa; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; 
/* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ + + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. 
*/ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; + +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(float); +//#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psdistribute()"); +#endif +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + sReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); +#endif + + if ( fact == SamePattern_SameRowPerm ) { + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... 
*/ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. 
*/ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif + + } else { /* fact is not SamePattern_SameRowPerm */ + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. 
*/ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. 
*/ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Phase 2 - setup U strut time: %.2f\t\n", t); +#endif + + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. 
*/ + if ( !(Lnzval_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. 
+ THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... */ + + jbrow = PROW( jb, grid ); + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. 
owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. 
proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + 
Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = 0.0; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] + and Lnzval_bc_ptr[ljb] here. */ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. 
*/ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... 
*/ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // 
for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. 
*/ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); +#endif + + } /* else fact != SamePattern_SameRowPerm */ + + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. 
*/ + SUPERLU_FREE(asub); + SUPERLU_FREE(a); + } + SUPERLU_FREE(xa); + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit psdistribute()"); +#endif + + return (mem_use+memTRS); + +} /* PSDISTRIBUTE */ diff --git a/SRC/psgsequ.c b/SRC/psgsequ.c new file mode 100644 index 00000000..02bd4a0d --- /dev/null +++ b/SRC/psgsequ.c @@ -0,0 +1,244 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Computes row and column scalings + * + * File name: psgsequ.c + * History: Modified from LAPACK routine SGEEQU + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSGSEQU computes row and column scalings intended to equilibrate an
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make
+    the largest element in each row and column of the matrix B with
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+    number and BIGNUM = largest safe number.  Use of these scaling
+    factors is not guaranteed to reduce the condition number of A but
+    works well in practice.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (output) float*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors
+            for A.
+
+    C       (output) float*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+
+    ROWCND  (output) float*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and
+            AMAX is neither too large nor too small, it is not worth
+            scaling by R.
+
+    COLCND  (output) float*
+            If INFO = 0, COLCND contains the ratio of the smallest
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not
+            worth scaling by C.
+
+    AMAX    (output) float*
+            Absolute value of largest matrix element.  If AMAX is very
+            close to overflow or very close to underflow, the matrix
+            should be scaled.
+
+    INFO    (output) int*
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+            > 0:  if INFO = i,  and i is
+                  <= M:  the i-th row of A is exactly zero
+                  >  M:  the (i-M)-th column of A is exactly zero
+
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+    =====================================================================
+
+*/ + +void +psgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info, gridinfo_t *grid) +{ + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int i, j, irow, jcol, m_loc; + float rcmin, rcmax; + float bignum, smlnum; + float tempmax, tempmin; + float *loc_max; + int *r_sizes, *displs; + float *loc_r; + int_t procs; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NR_loc || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + pxerr_dist("psgsequ", grid, i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Get machine constants. */ + smlnum = smach_dist("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + + /* Get the global MAX and MIN for R */ + tempmax = rcmax; + tempmin = rcmin; + MPI_Allreduce( &tempmax, &rcmax, + 1, MPI_FLOAT, MPI_MAX, grid->comm); + MPI_Allreduce( &tempmin, &rcmin, + 1, MPI_FLOAT, MPI_MIN, grid->comm); + + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. 
/ SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; + } + + /* Find the global maximum for c[j] */ + if ( !(loc_max = floatMalloc_dist(A->ncol))) + ABORT("Malloc fails for loc_max[]."); + for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; + MPI_Allreduce(loc_max, c, A->ncol, MPI_FLOAT, MPI_MAX, grid->comm); + SUPERLU_FREE(loc_max); + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* gather R from each process to get the global R. */ + + procs = grid->nprow * grid->npcol; + if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) + ABORT("Malloc fails for r_sizes[]."); + displs = r_sizes + procs; + if ( !(loc_r = floatMalloc_dist(m_loc))) + ABORT("Malloc fails for loc_r[]."); + j = Astore->fst_row; + for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; + + /* First gather the size of each piece. 
*/ + MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); + + /* Set up the displacements for allgatherv */ + displs[0] = 0; + for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + + /* Now gather the actual data */ + MPI_Allgatherv(loc_r, m_loc, MPI_FLOAT, r, r_sizes, displs, + MPI_FLOAT, grid->comm); + + SUPERLU_FREE(r_sizes); + SUPERLU_FREE(loc_r); + + return; + +} /* psgsequ */ diff --git a/SRC/psgsmv.c b/SRC/psgsmv.c new file mode 100644 index 00000000..b0478a9d --- /dev/null +++ b/SRC/psgsmv.c @@ -0,0 +1,383 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Parallel sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +void psgsmv_init +( + SuperMatrix *A, /* Matrix A permuted by columns (input/output). + The type of A can be: + Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t *row_to_proc, /* Input. Mapping between rows and processes. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ + ) +{ + NRformat_loc *Astore; + int iam, p, procs; + int *SendCounts, *RecvCounts; + int_t i, j, k, l, m, m_loc, n, fst_row, jcol; + int_t TotalIndSend, TotalValSend; + int_t *colind, *rowptr; + int_t *ind_tosend = NULL, *ind_torecv = NULL; + int_t *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, *spa, *itemp; + float *nzval, *val_tosend = NULL, *val_torecv = NULL, t; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv_init()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + m = A->nrow; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = Astore->nzval; + if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for SendCounts[]"); + /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ + RecvCounts = SendCounts + procs; + if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) + ABORT("Malloc fails for ptr_ind_tosend[]"); + ptr_ind_torecv = ptr_ind_tosend + procs + 1; + if ( !(extern_start = intMalloc_dist(m_loc)) ) + ABORT("Malloc fails for extern_start[]"); + for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. 
+ THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. + SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE + LOCAL PART OF X. + THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ + ABORT("Malloc fails for spa[]"); + for (p = 0; p < procs; ++p) SendCounts[p] = 0; + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = extern_start[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ + jcol = colind[j]; + p = row_to_proc[jcol]; + if ( p != iam ) { /* External */ + if ( spa[jcol] == 0 ) { /* First time see this index */ + ++SendCounts[p]; + spa[jcol] = 1; + } + } else { /* Swap to beginning the part of A corresponding + to the local part of X */ + l = colind[k]; + t = nzval[k]; + colind[k] = jcol; + nzval[k] = nzval[j]; + colind[j] = l; + nzval[j] = t; + ++k; + } + } + extern_start[i] = k; + } + + /* ------------------------------------------------------------ + LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. + THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + /* Build pointers to ind_tosend[]. */ + ptr_ind_tosend[0] = 0; + for (p = 0, TotalIndSend = 0; p < procs; ++p) { + TotalIndSend += SendCounts[p]; /* Total to send. */ + ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; + } +#if 0 + ptr_ind_tosend[iam] = 0; /* Local part of X */ +#endif + if ( TotalIndSend ) { + if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ + } + + /* Build SPA to aid global to local translation. 
*/ + for (i = 0; i < n; ++i) spa[i] = EMPTY; + for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + if ( spa[jcol] == EMPTY ) { /* First time see this index */ + p = row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + /*assert(jcol>=fst_row);*/ + spa[jcol] = jcol - fst_row; /* Relative position in local X */ + } else { /* External */ + ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ + spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ + ++ptr_ind_tosend[p]; + } + } + } + } + + /* ------------------------------------------------------------ + TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. + THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + colind[j] = spa[jcol]; + } + } + + /* ------------------------------------------------------------ + COMMUNICATE THE EXTERNAL INDICES OF X. + ------------------------------------------------------------*/ + MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, + grid->comm); + + /* Build pointers to ind_torecv[]. */ + ptr_ind_torecv[0] = 0; + for (p = 0, TotalValSend = 0; p < procs; ++p) { + TotalValSend += RecvCounts[p]; /* Total to receive. 
*/ + ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; + } + if ( TotalValSend ) { + if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for ind_torecv[]"); + } + + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ + if ( SendCounts[p] ) { + MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], + mpi_int_t, p, iam, grid->comm, &send_req[p]); + } + if ( RecvCounts[p] ) { + MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], + mpi_int_t, p, p, grid->comm, &recv_req[p]); + } + } + for (p = 0; p < procs; ++p) { + if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Allocate storage for the X values to to transferred. */ + if ( TotalIndSend && + !(val_torecv = floatMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for val_torecv[]."); + if ( TotalValSend && + !(val_tosend = floatMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for val_tosend[]."); + + gsmv_comm->extern_start = extern_start; + gsmv_comm->ind_tosend = ind_tosend; + gsmv_comm->ind_torecv = ind_torecv; + gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; + gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; + gsmv_comm->SendCounts = SendCounts; + gsmv_comm->RecvCounts = RecvCounts; + gsmv_comm->val_tosend = val_tosend; + gsmv_comm->val_torecv = val_torecv; + gsmv_comm->TotalIndSend = TotalIndSend; + gsmv_comm->TotalValSend = TotalValSend; + + SUPERLU_FREE(spa); + SUPERLU_FREE(send_req); + +#if ( DEBUGlevel>=2 ) + PrintInt10("psgsmv_init::rowptr", m_loc+1, rowptr); + PrintInt10("psgsmv_init::extern_start", m_loc, extern_start); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv_init()"); +#endif + +} /* PSGSMV_INIT */ + + +/* + * Performs sparse matrix-vector multiplication. 
+ */ +void +psgsmv +( + int_t abs, /* Input. Do abs(A)*abs(x). */ + SuperMatrix *A_internal, /* Input. Matrix A permuted by columns. + The column indices are translated into + the relative positions in the gathered x-vector. + The type of A can be: + Stype = NR_loc; Dtype = SLU_S; Mtype = GE. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */ + float x[], /* Input. The distributed source vector */ + float ax[] /* Output. The distributed destination vector */ +) +{ + NRformat_loc *Astore; + int iam, procs; + int_t i, j, p, m, m_loc, n, fst_row, jcol; + int_t *colind, *rowptr; + int *SendCounts, *RecvCounts; + int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, TotalValSend; + float *nzval, *val_tosend, *val_torecv; + float zero = 0.0; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A_internal->Store; + m = A_internal->nrow; + n = A_internal->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = (float *) Astore->nzval; + extern_start = gsmv_comm->extern_start; + ind_torecv = gsmv_comm->ind_torecv; + ptr_ind_tosend = gsmv_comm->ptr_ind_tosend; + ptr_ind_torecv = gsmv_comm->ptr_ind_torecv; + SendCounts = gsmv_comm->SendCounts; + RecvCounts = gsmv_comm->RecvCounts; + val_tosend = (float *) gsmv_comm->val_tosend; + val_torecv = (float *) gsmv_comm->val_torecv; + TotalValSend = gsmv_comm->TotalValSend; + + /* ------------------------------------------------------------ + COPY THE X VALUES INTO THE SEND BUFFER. 
+ ------------------------------------------------------------*/ + for (i = 0; i < TotalValSend; ++i) { + j = ind_torecv[i] - fst_row; /* Relative index in x[] */ + val_tosend[i] = x[j]; + } + + /* ------------------------------------------------------------ + COMMUNICATE THE X VALUES. + ------------------------------------------------------------*/ + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) { + MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p], + MPI_FLOAT, p, iam, + grid->comm, &send_req[p]); + } + if ( SendCounts[p] ) { + MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p], + MPI_FLOAT, p, p, + grid->comm, &recv_req[p]); + } + } + + /* ------------------------------------------------------------ + PERFORM THE ACTUAL MULTIPLICATION. + ------------------------------------------------------------*/ + if ( abs ) { /* Perform abs(A)*abs(x) */ + /* Multiply the local part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = 0.0; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(x[jcol]); + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]); + } + } + } else { + /* Multiply the local part. 
*/ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = zero; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * x[jcol]; + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * val_torecv[jcol]; + } + } + } + + SUPERLU_FREE(send_req); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv()"); +#endif + +} /* PSGSMV */ + +void psgsmv_finalize(psgsmv_comm_t *gsmv_comm) +{ + int_t *it; + float *dt; + SUPERLU_FREE(gsmv_comm->extern_start); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); + SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); + SUPERLU_FREE(gsmv_comm->SendCounts); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); +} + diff --git a/SRC/psgsmv_AXglobal.c b/SRC/psgsmv_AXglobal.c new file mode 100644 index 00000000..b81b3192 --- /dev/null +++ b/SRC/psgsmv_AXglobal.c @@ -0,0 +1,324 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + + +static void screate_msr_matrix(SuperMatrix *, int_t [], int_t, + float **, int_t **); +static void sPrintMSRmatrix(int, float [], int_t [], gridinfo_t *); + + +int psgsmv_AXglobal_setup +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + Glu_persist_t *Glu_persist, /* input */ + gridinfo_t *grid, /* input */ + int_t *m, /* output */ + int_t *update[], /* output */ + float *val[], /* output */ + int_t *bindx[], /* output */ + int_t *mv_sup_to_proc /* output */ + ) +{ + int n; + int input_option; + int N_update; /* Number of variables updated on this process (output) */ + int iam = grid->iam; + int nprocs = grid->nprow * grid->npcol; + int_t *xsup = Glu_persist->xsup; + int_t *supno = Glu_persist->supno; + int_t nsupers; + int i, nsup, p, t1, t2, t3; + + + /* Initialize the list of global indices. + * NOTE: the list of global indices must be in ascending order. + */ + n = A->nrow; + input_option = SUPER_LINEAR; + nsupers = supno[n-1] + 1; + +#if ( DEBUGlevel>=2 ) + if ( !iam ) { + PrintInt10("xsup", supno[n-1]+1, xsup); + PrintInt10("supno", n, supno); + } +#endif + + if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on + individual rows. */ + /* Figure out mv_sup_to_proc[] on all processes. */ + for (p = 0; p < nprocs; ++p) { + t1 = n / nprocs; /* Number of rows */ + t2 = n - t1 * nprocs; /* left-over, which will be assigned + to the first t2 processes. */ + if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ + else { /* First t2 processes will get one more row. */ + ++t1; /* Number of rows. */ + t2 = p * t1; /* Starting row. */ + } + /* Make sure the starting and ending rows are at the + supernode boundaries. */ + t3 = t2 + t1; /* Ending row. */ + nsup = supno[t2]; + if ( t2 > xsup[nsup] ) { /* Round up the starting row. 
*/ + t1 -= xsup[nsup+1] - t2; + t2 = xsup[nsup+1]; + } + nsup = supno[t3]; + if ( t3 > xsup[nsup] ) /* Round up the ending row. */ + t1 += xsup[nsup+1] - t3; + t3 = t2 + t1 - 1; + if ( t1 ) { + for (i = supno[t2]; i <= supno[t3]; ++i) { + mv_sup_to_proc[i] = p; +#if ( DEBUGlevel>=3 ) + if ( mv_sup_to_proc[i] == p-1 ) { + fprintf(stderr, + "mv_sup_to_proc conflicts at supno %d\n", i); + exit(-1); + } +#endif + } + } + + if ( iam == p ) { + N_update = t1; + if ( N_update ) { + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + } + for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; +#if ( DEBUGlevel>=3 ) + printf("(%2d) N_update = %4d\t" + "supers %4d to %4d\trows %4d to %4d\n", + iam, N_update, supno[t2], supno[t3], t2, t3); +#endif + } + } /* for p ... */ + } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on + individual supernodes. */ + /* This may cause bad load balance, because the blocks are usually + small in the beginning and large toward the end. */ + t1 = nsupers / nprocs; + t2 = nsupers - t1 * nprocs; /* left-over */ + if ( iam >= t2 ) t2 += (iam * t1); + else { + ++t1; /* Number of blocks. */ + t2 = iam * t1; /* Starting block. */ + } + N_update = xsup[t2+t1] - xsup[t2]; + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; + } + + + /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */ + screate_msr_matrix(A, *update, N_update, val, bindx); + +#if ( DEBUGlevel>=2 ) + PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); + sPrintMSRmatrix(N_update, *val, *bindx, grid); +#endif + + *m = N_update; + return 0; +} /* PSGSMV_AXglobal_SETUP */ + + +/*! \brief + * + *
+ * Create the distributed modified sparse row (MSR) matrix: bindx/val.
+ * For a submatrix of size m-by-n, the MSR arrays are as follows:
+ *    bindx[0]      = m + 1
+ *    bindx[0..m]   = pointer to start of each row
+ *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
+ *                    where, ks = bindx[k], ke = bindx[k+1]-1
+ *    val[k]        = A(k,k), k < m, diagonal elements
+ *    val[m]        = not used
+ *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
+ * Both arrays are of length nnz + 1.
+ * 
+*/ +static void screate_msr_matrix +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t update[], /* input (local) */ + int_t N_update, /* input (local) */ + float **val, /* output */ + int_t **bindx /* output */ +) +{ + int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; + NCPformat *Astore; + float *nzval; + int_t *rowcnt; + double zero = 0.0; + + if ( !N_update ) return; + + n = A->ncol; + Astore = A->Store; + nzval = Astore->nzval; + + /* One pass of original matrix A to count nonzeros of each row. */ + if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) + ABORT("Malloc fails for rowcnt[]"); + lo = update[0]; + hi = update[N_update-1]; + nnz_local = 0; + nnz_diag = 0; + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow != j ) /* Exclude diagonal */ + ++rowcnt[irow - lo]; + else ++nnz_diag; /* Count nonzero diagonal entries */ + ++nnz_local; + } + } + } + + /* Add room for the logical diagonal zeros which are not counted + in nnz_local. */ + nnz_local += (N_update - nnz_diag); + + /* Allocate storage for bindx[] and val[]. */ + if ( !(*val = (float *) floatMalloc_dist(nnz_local+1)) ) + ABORT("Malloc fails for val[]"); + for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ + if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) + ABORT("Malloc fails for bindx[]"); + + /* Set up row pointers. */ + (*bindx)[0] = N_update + 1; + for (j = 1; j <= N_update; ++j) { + (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; + rowcnt[j-1] = (*bindx)[j-1]; + } + + /* One pass of original matrix A to fill in matrix entries. 
*/ + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow == j ) /* Diagonal */ + (*val)[irow - lo] = nzval[i]; + else { + irow -= lo; + k = rowcnt[irow]; + (*bindx)[k] = j; + (*val)[k] = nzval[i]; + ++rowcnt[irow]; + } + } + } + } + + SUPERLU_FREE(rowcnt); +} + +/*! \brief + * + *
+ * Performs sparse matrix-vector multiplication.
+ *   - val/bindx stores the distributed MSR matrix A
+ *   - X is global
+ *   - ax product is distributed the same way as A
+ * 
+ */ +int +psgsmv_AXglobal(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += val[k] * X[j]; + } + ax[i] += val[i] * X[update[i]]; /* diagonal */ + } + return 0; +} /* PSGSMV_AXglobal */ + +/* + * Performs sparse matrix-vector multiplication. + * - val/bindx stores the distributed MSR matrix A + * - X is global + * - ax product is distributed the same way as A + */ +int +psgsmv_AXglobal_abs(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += fabs(val[k]) * fabs(X[j]); + } + ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */ + } + + return 0; +} /* PSGSMV_AXglobal_ABS */ + +/* + * Print the local MSR matrix + */ +static void sPrintMSRmatrix +( + int m, /* Number of rows of the submatrix. */ + float val[], + int_t bindx[], + gridinfo_t *grid +) +{ + int iam, nnzp1; + + if ( !m ) return; + + iam = grid->iam; + nnzp1 = bindx[m]; + printf("(%2d) MSR submatrix has %d rows -->\n", iam, m); + Printfloat5("val", nnzp1, val); + PrintInt10("bindx", nnzp1, bindx); +} diff --git a/SRC/psgsrfs.c b/SRC/psgsrfs.c new file mode 100644 index 00000000..db734585 --- /dev/null +++ b/SRC/psgsrfs.c @@ -0,0 +1,260 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Last modified:
+ * December 31, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSRFS improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (local)
+ *        The m_loc-by-NRHS right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (local)
+ *        On entry, the solution matrix Y, as computed by PSGSTRS, of the
+ *            transformed system A1*Y = Pc*Pr*B. where
+ *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
+ *        On exit, the improved solution matrix Y.
+ *
+ *        In order to obtain the solution X to the original system,
+ *        Y should be permutated by Pc^T, and premultiplied by diag(C)
+ *        if DiagScale = COL or BOTH.
+ *        This must be done after this routine is called.
+ *
+ * ldx    (input) int (local)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (output) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
+ * 
+ */ +void +psgsrfs(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct, + sScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, + float *B, int_t ldb, float *X, int_t ldx, int nrhs, + sSOLVEstruct_t *SOLVEstruct, + float *berr, SuperLUStat_t *stat, int *info) +{ +#define ITMAX 20 + + float *ax, *R, *dx, *temp, *work, *B_col, *X_col; + int_t count, i, j, lwork, nz; + int iam; + float eps, lstres; + float s, safmin, safe1, safe2; + + /* Data structures used by matrix-vector multiply routine. */ + psgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; + NRformat_loc *Astore; + int_t m_loc, fst_row; + + + /* Initialization. */ + Astore = (NRformat_loc *) A->Store; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + iam = grid->iam; + + /* Test the input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; + else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; + else if ( nrhs < 0 ) *info = -13; + if (*info != 0) { + i = -(*info); + pxerr_dist("PSGSRFS", grid, i); + return; + } + + /* Quick return if possible. */ + if ( n == 0 || nrhs == 0 ) { + return; + } + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgsrfs()"); +#endif + + lwork = 2 * m_loc; /* For ax/R/dx and temp */ + if ( !(work = floatMalloc_dist(lwork)) ) + ABORT("Malloc fails for work[]"); + ax = R = dx = work; + temp = ax + m_loc; + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = smach_dist("Epsilon"); + safmin = smach_dist("Safe minimum"); + + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf(".. 
eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", + eps, anorm, safe1, safe2); +#endif + + /* Do for each right-hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + B_col = &B[j*ldb]; + X_col = &X[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + + /* Matrix-vector multiply. */ + psgsmv(0, A, grid, gsmv_comm, X_col, ax); + + /* Compute residual, stored in R[]. */ + for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i]; + + /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ + psgsmv(1, A, grid, gsmv_comm, X_col, temp); + for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]); + + s = 0.0; + for (i = 0; i < m_loc; ++i) { + if ( temp[i] > safe2 ) { + s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); + } else if ( temp[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]); + } + /* If temp[i] is exactly 0.0 (computed by PxGSMV), then + we know the true residual also must be exactly 0.0. */ + } + MPI_Allreduce( &s, &berr[j], 1, MPI_FLOAT, MPI_MAX, grid->comm ); + +#if ( PRNTlevel>= 1 ) + if ( !iam ) + printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); +#endif + if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { + /* Compute new dx. */ + psgstrs(n, LUstruct, ScalePermstruct, grid, + dx, m_loc, fst_row, m_loc, 1, + SOLVEstruct, stat, info); + + /* Update solution. */ + for (i = 0; i < m_loc; ++i) X_col[i] += dx[i]; + + lstres = berr[j]; + ++count; + } else { + break; + } + } /* end while */ + + stat->RefineSteps = count; + + } /* for j ... */ + + /* Deallocate storage. 
*/ + SUPERLU_FREE(work); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsrfs()"); +#endif + +} /* PSGSRFS */ + diff --git a/SRC/psgsrfs_ABXglobal.c b/SRC/psgsrfs_ABXglobal.c new file mode 100644 index 00000000..989f591d --- /dev/null +++ b/SRC/psgsrfs_ABXglobal.c @@ -0,0 +1,465 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Improves the computed solution and provies error bounds + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015  version 4.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*-- Function prototypes --*/ +static void gather_1rhs_diag_to_all(int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t, int_t [], + int_t [], float [], float []); +static void redist_all_to_diag(int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t [], float []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgsrfs_ABXglobal improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
+ *        are permutation matrices. The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ *        NOTE: Currently, A must reside in all processes when calling
+ *              this routine.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (global)
+ *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *
+ *        NOTE: Currently, B must reside on all processes when calling
+ *              this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (global)
+ *        On entry, the solution matrix X, as computed by PSGSTRS.
+ *        On exit, the improved solution matrix X.
+ *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
+ *        in order to obtain the solution to the original system.
+ *
+ *        NOTE: Currently, X must reside on all processes when calling
+ *              this routine.
+ *
+ * ldx    (input) int (global)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
+ * 
+ */ + +void +psgsrfs_ABXglobal(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct, + gridinfo_t *grid, float *B, int_t ldb, float *X, int_t ldx, + int nrhs, float *berr, SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 20 + + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + /* + * Data structures used by matrix-vector multiply routine. + */ + int_t N_update; /* Number of variables updated on this process */ + int_t *update; /* vector elements (global index) updated + on this processor. */ + int_t *bindx; + float *val; + int_t *mv_sup_to_proc; /* Supernode to process mapping in + matrix-vector multiply. */ + /*-- end data structures for matrix-vector multiply --*/ + float *b, *ax, *R, *B_col, *temp, *work, *X_col, + *x_trs, *dx_trs; + int_t count, ii, j, jj, k, knsupc, lk, lwork, + nprow, nsupers, nz, p; + int i, iam, pkk; + int_t *ilsum, *xsup; + double eps, lstres; + double s, safmin, safe1, safe2; + + /* NEW STUFF */ + int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ + int_t *diag_len; /* Length of the X vector on diagonal processes. */ + + /*-- Function prototypes --*/ + extern void psgstrs1(int_t, sLUstruct_t *, gridinfo_t *, + float *, int, SuperLUStat_t *, int *); + + /* Test the input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NCP || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; + else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; + else if ( nrhs < 0 ) *info = -13; + if (*info != 0) { + i = -(*info); + pxerr_dist("psgsrfs_ABXglobal", grid, i); + return; + } + + /* Quick return if possible. */ + if ( n == 0 || nrhs == 0 ) { + return; + } + + /* Initialization. 
*/ + iam = grid->iam; + nprow = grid->nprow; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgsrfs_ABXglobal()"); +#endif + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. number of diag processes = " IFMT "\n", num_diag_procs); + PrintInt10("diag_procs", num_diag_procs, diag_procs); + PrintInt10("diag_len", num_diag_procs, diag_len); + } +#endif + + if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) + ABORT("Calloc fails for mv_sup_to_proc[]"); + + psgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, + &val, &bindx, mv_sup_to_proc); + + i = CEILING( nsupers, nprow ); /* Number of local block rows */ + ii = Llu->ldalsum + i * XK_H; + k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); + jj = SUPERLU_MAX( jj, N_update ); + lwork = N_update /* For ax and R */ + + ii /* For dx_trs */ + + ii /* For x_trs */ + + k /* For b */ + + jj; /* for temp */ + if ( !(work = floatMalloc_dist(lwork)) ) + ABORT("Malloc fails for work[]"); + ax = R = work; + dx_trs = work + N_update; + x_trs = dx_trs + ii; + b = x_trs + ii; + temp = b + k; + +#if ( DEBUGlevel>=2 ) + { + float *dwork = floatMalloc_dist(n); + for (i = 0; i < n; ++i) { + if ( i & 1 ) dwork[i] = 1.; + else dwork[i] = 2.; + } + /* Check correctness of matrix-vector multiply. */ + psgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); + Printfloat5("Mult A*x", N_update, ax); + SUPERLU_FREE(dwork); + } +#endif + + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = smach_dist("Epsilon"); + safmin = smach_dist("Safe minimum"); + + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. 
*/ + safe1 = nz * safmin; + safe2 = safe1 / eps; + +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", + eps, anorm, safe1, safe2); +#endif + + /* Do for each right-hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + + /* Copy X into x on the diagonal processes. */ + B_col = &B[j*ldb]; + X_col = &X[j*ldx]; + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + jj = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; + dx_trs[ii-XK_H] = k;/* Block number prepended in header. */ + } + } + } + /* Copy B into b distributed the same way as matrix-vector product. */ + if ( N_update ) ii = update[0]; + for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + + /* Matrix-vector multiply. */ + psgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); + + /* Compute residual. */ + for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i]; + + /* Compute abs(op(A))*abs(X) + abs(B). */ + psgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp); + for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]); + + s = 0.0; + for (i = 0; i < N_update; ++i) { + if ( temp[i] > safe2 ) { + s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); + } else if ( temp[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]); + } + /* If temp[i] is exactly 0.0 (computed by PxGSMV), then + we know the true residual also must be exactly 0.0. */ + } + MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); + +#if ( PRNTlevel>= 1 ) + if ( !iam ) + printf("(%2d) .. 
Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); +#endif + if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { + /* Compute new dx. */ + redist_all_to_diag(n, R, Glu_persist, Llu, grid, + mv_sup_to_proc, dx_trs); + psgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); + + /* Update solution. */ + for (p = 0; p < num_diag_procs; ++p) + if ( iam == diag_procs[p] ) + for (k = p; k < nsupers; k += num_diag_procs) { + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + knsupc = SuperSize( k ); + for (i = 0; i < knsupc; ++i) + x_trs[i + ii] += dx_trs[i + ii]; + } + lstres = berr[j]; + ++count; + /* Transfer x_trs (on diagonal processes) into X + (on all processes). */ + gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, + num_diag_procs, diag_procs, diag_len, + X_col, temp); + } else { + break; + } + } /* end while */ + + stat->RefineSteps = count; + + } /* for j ... */ + + + /* Deallocate storage used by matrix-vector multiplication. */ + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + if ( N_update ) { + SUPERLU_FREE(update); + SUPERLU_FREE(bindx); + SUPERLU_FREE(val); + } + SUPERLU_FREE(mv_sup_to_proc); + SUPERLU_FREE(work); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsrfs_ABXglobal()"); +#endif + +} /* PSGSRFS_ABXGLOBAL */ + + +/*! \brief + * + *
+ * r[] is the residual vector distributed the same way as
+ * matrix-vector product.
+ * 
+ */ +static void +redist_all_to_diag(int_t n, float r[], Glu_persist_t *Glu_persist, + sLocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], + float work[]) +{ + int_t i, ii, k, lk, lr, nsupers; + int_t *ilsum, *xsup; + int iam, knsupc, psrc, pkk; + MPI_Status status; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + lr = 0; + + for (k = 0; k < nsupers; ++k) { + pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); + psrc = mv_sup_to_proc[k]; + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + if ( iam == psrc ) { + if ( iam != pkk ) { /* Send X component. */ + MPI_Send( &r[lr], knsupc, MPI_FLOAT, pkk, Xk, + grid->comm ); + } else { /* Local copy. */ + for (i = 0; i < knsupc; ++i) + work[i + ii] = r[i + lr]; + } + lr += knsupc; + } else { + if ( iam == pkk ) { /* Recv X component. */ + MPI_Recv( &work[ii], knsupc, MPI_FLOAT, psrc, Xk, + grid->comm, &status ); + } + } + } +} /* REDIST_ALL_TO_DIAG */ + + +/*! \brief + * + *
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * 
+ */ +static void +gather_1rhs_diag_to_all(int_t n, float x[], + Glu_persist_t *Glu_persist, sLocalLU_t *Llu, + gridinfo_t *grid, int_t num_diag_procs, + int_t diag_procs[], int_t diag_len[], + float y[], float work[]) +{ + int_t i, ii, k, lk, lwork, nsupers, p; + int_t *ilsum, *xsup; + int iam, knsupc, pkk; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy x vector into a buffer. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; + lwork += knsupc; + } + MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( work, diag_len[p], MPI_FLOAT, pkk, grid->comm ); + } + /* Scatter work[] into global y vector. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + ii = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; + lwork += knsupc; + } + } +} /* GATHER_1RHS_DIAG_TO_ALL */ + diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c new file mode 100644 index 00000000..73020a02 --- /dev/null +++ b/SRC/psgssvx.c @@ -0,0 +1,1579 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B + * + *
+ * -- Distributed SuperLU routine (version 6.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * October  1, 2014
+ * April 5, 2015
+ * December 31, 2015  version 4.3
+ * December 31, 2016  version 5.1.3
+ * April 10, 2018  version 5.3
+ * September 18, 2018  version 6.0
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSSVX solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and both
+ *             row and column permutation vectors perm_r and perm_c, as
+ *             well as the distributed data structure set up from the
+ *             previous symbolic factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
+ * 
+ */ + +void +psgssvx(superlu_dist_options_t *options, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo_t *grid, + sLUstruct_t *LUstruct, sSOLVEstruct_t *SOLVEstruct, float *berr, + SuperLUStat_t *stat, int *info) +{ + NRformat_loc *Astore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + float *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + fact_t Fact; + float *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR*/ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam,iam_g; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use = 0.0; /* memory usage by global A */ + float dist_mem_use = 0.0; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + int64_t nnzLU; + int_t nnz_tot; + float *nzval_a; + float asum,asum_tot,lsum,lsum_tot; + int_t nsupers,nsupers_j; + int_t lk,k,knsupc,nsupr; + int_t *lsub,*xsup; + float *lusup; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 's'; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep, parSymbFact; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + /* Initialization. */ + m = A->nrow; + n = A->ncol; + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (float *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + num_mem_usage.for_lu = num_mem_usage.total = 0.0; + symb_mem_usage.for_lu = symb_mem_usage.total = 0.0; + + /* Test the input parameters. 
*/ + *info = 0; + Fact = options->Fact; + if ( Fact < DOFACT || Fact > FACTORED ) + *info = -1; + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) + *info = -1; + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) + *info = -1; + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) + *info = -1; + else if ( options->IterRefine == SLU_EXTRA ) { + *info = -1; + printf("ERROR: Extra precise iterative refinement yet to support.\n"); + } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < m_loc ) + *info = -5; + else if ( nrhs < 0 ) + *info = -6; + if ( sp_ienv_dist(2) > sp_ienv_dist(3) ) { + *info = 1; + printf("ERROR: Relaxation (NREL) cannot be larger than max. supernode size (NSUP).\n" + "\t-> Check parameter setting in sp_ienv_dist.c to correct error.\n"); + } + if ( *info ) { + i = -(*info); + pxerr_dist("psgssvx", grid, -*info); + return; + } + + factored = (Fact == FACTORED); + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + parSymbFact = options->ParSymbFact; + + iam = grid->iam; + job = 5; + if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } else rowequ = colequ = FALSE; + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgssvx()"); +#endif + + /* Not factored & ask for equilibration */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + /* Allocate storage if not done so before. 
*/ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + * Diagonal scaling to equilibrate the matrix. (simple scheme) + * for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:)); + * for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j)) + * ------------------------------------------------------------*/ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter equil"); +#endif + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm ) { + /* Reuse R and C. */ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + a[i] *= R[irow]; /* Scale rows. */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j+1]; ++i){ + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. 
*/ + psgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pslaqgs(A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * For serial symbolic factorization, gather A from the distributed + * compressed row format to global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if ( Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO) ) { + /* Performs serial symbolic factorzation and/or MC64 */ + + need_value = (options->RowPerm == LargeDiag_MC64); + + psCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof(int_t); + + if ( need_value ) { + a_GA = (float *) GAstore->nzval; + GA_mem_use += nnz * sizeof(float); + } else assert(GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation Pr for A, and apply Pr*[GA]. + GA is overwritten by Pr*[GA]. + ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + t = SuperLU_timer_(); + if ( Fact != SamePattern_SameRowPerm ) { + if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] from MC64 */ + if ( job == 5 ) { + /* Allocate storage for scaling factors. 
*/ + if ( !(R1 = floatMalloc_dist(m)) ) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if ( !(C1 = floatMalloc_dist(n)) ) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + if ( !iam ) { /* Process 0 finds a row permutation */ + iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = smach_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if ( job == 5 ) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if ( perm_r[irow] == icol ) { /* New diagonal */ + if ( job == 2 || job == 3 ) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if ( job == 4 ) + dsum += fabs(a[i]); + else if ( job == 5 ) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; + else for (i = 0; i < m; ++i) R[i] = R1[i]; + if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; + else for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end Equil */ + + /* Now permute global GA to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( PRNTlevel>=2 ) + if ( job == 2 || job == 3 ) { + if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); + } else if ( job == 4 ) { + if ( !iam ) printf("\tsum of diagonal %e\n", dsum); + } else if ( job == 5 ) { + if ( !iam ) printf("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + s_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if options->RowPerm ... */ + + t = SuperLU_timer_() - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact ... */ + + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) PrintInt10("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if ( notran ) *(unsigned char *)norm = '1'; + else *(unsigned char *)norm = 'I'; + anorm = pslangs(norm, A, grid); +#if ( PRNTlevel>=1 ) + if ( !iam ) { printf(".. anorm %e\n", anorm); fflush(stdout); } +#endif + } + + /* ------------------------------------------------------------ + Perform the LU factorization: symbolic factorization, + redistribution, and numerical factorization. + ------------------------------------------------------------*/ + if ( !factored ) { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if ( parSymbFact == YES || permc_spec == PARMETIS ) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) col = 0; + else col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm ); + + if ( permc_spec == NATURAL || permc_spec == MY_PERMC ) { + if ( permc_spec == NATURAL ) { + for (j = 0; j < n; ++j) perm_c[j] = j; + } + if ( !(sizes = intMalloc_dist(2 * noDomains)) ) + ABORT("SUPERLU_MALLOC fails for sizes."); + if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) ) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2*noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2*noDomains - 2] = m; + fstVtxSep[2*noDomains - 2] = 0; + } else if ( permc_spec != PARMETIS ) { /* same as before */ + printf("{" IFMT "," IFMT "}: psgssvx: invalid ColPerm option when ParSymbfact is used\n", + 
MYROW(grid->iam, grid), MYCOL(grid->iam, grid)); + } + } + + if ( permc_spec != MY_PERMC && Fact == DOFACT ) { + /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */ + if ( permc_spec == PARMETIS ) { + // #pragma omp parallel + // { + // #pragma omp master + // { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + // } + // } + if (flinfo > 0) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n"); +#endif + *info = flinfo; + return; + } + } else { + get_perm_c_dist(iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Symbolic factorization. */ + if ( Fact != SamePattern_SameRowPerm ) { + if ( parSymbFact == NO ) { /* Perform serial symbolic factorization */ + /* GA = Pr*A, perm_r[] is already applied. */ + int_t *GACcolbeg, *GACcolend, *GACrowind; + + /* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. + After this routine, GAC = GA*Pc^T. */ + sp_colorder(options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up + the nonzero data structures for L & U. 
*/ +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); + fflush(stdout); + } +#endif + t = SuperLU_timer_(); + if ( !(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) + ABORT("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + nnzLU = Glu_freeable->nnzLU; + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if ( iinfo <= 0 ) { /* Successful return */ + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); + printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); + printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu*1e-6, + symb_mem_usage.total*1e-6, + symb_mem_usage.expansions); + fflush(stdout); + } +#endif + } else { /* symbfact out of memory */ +#if ( PRNTlevel>=1 ) + if ( !iam ) + fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo); +#endif + *info = iinfo; + return; + } + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_(); + flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + nnzLU = Pslu_freeable.nnzLU; + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (flinfo > 0) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "Insufficient memory for parallel symbolic factorization."); +#endif + *info = flinfo; + return; + } + } + + /* Destroy global GA */ + if ( parSymbFact == NO || options->RowPerm != NO ) + Destroy_CompCol_Matrix_dist(&GA); + if ( 
parSymbFact == NO ) + Destroy_CompCol_Permuted_dist(&GAC); + + } /* end if Fact != SamePattern_SameRowPerm ... */ + + if (sizes) SUPERLU_FREE (sizes); + if (fstVtxSep) SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); + + /* Distribute entries of A into L & U data structures. */ + //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) { + if ( parSymbFact == NO ) { + /* CASE OF SERIAL SYMBOLIC */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); + dist_mem_use = psdistribute(Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + } else { /* CASE OF PARALLEL SYMBOLIC */ + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = sdist_psymbtonum(Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]);*/ + + /* Perform numerical factorization in parallel. 
*/ + t = SuperLU_timer_(); + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + // } + // } + + +#if ( PRNTlevel>=2 ) + /* ------------------------------------------------------------ + SUM OVER ALL ENTRIES OF A AND PRINT NNZ AND SIZE OF A. + ------------------------------------------------------------*/ + Astore = (NRformat_loc *) A->Store; + xsup = Glu_persist->xsup; + nzval_a = Astore->nzval; + + + asum=0; + for (i = 0; i < Astore->m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + asum += nzval_a[j]; + } + } + + nsupers = Glu_persist->supno[n-1] + 1; + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + + + lsum=0.0; + for (lk=0;lkLlu->Lrowind_bc_ptr[lk]; + lusup = LUstruct->Llu->Lnzval_bc_ptr[lk]; + if(lsub){ + k = MYCOL(grid->iam, grid)+lk*grid->npcol; /* not sure */ + knsupc = SuperSize( k ); + nsupr = lsub[1]; + for (j=0; jcomm ); + MPI_Allreduce( &lsum, &lsum_tot,1, MPI_FLOAT, MPI_SUM, grid->comm ); + + + MPI_Allreduce( &Astore->rowptr[Astore->m_loc], &nnz_tot,1, mpi_int_t, MPI_SUM, grid->comm ); + // MPI_Bcast( &nnzLU, 1, mpi_int_t, 0, grid->comm ); + + MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); + + printf(".. 
Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); + fflush(stdout); +#endif + +#if 0 + +// #ifdef GPU_PROF + +// if(!iam ) +// { +// char* ttemp; + +// ttemp = getenv("IO_FILE"); +// if(ttemp!=NULL) +// { +// printf("File being opend is %s\n",ttemp ); +// FILE* fp; +// fp = fopen(ttemp,"w"); +// if(!fp) +// { +// fprintf(stderr," Couldn't open output file %s\n",ttemp); +// } + +// int nsup=Glu_persist->supno[n-1]+1; +// int ii; +// for (ii = 0; ii < nsup; ++ii) +// { +// fprintf(fp,"%d,%d,%d,%d,%d,%d\n",gs1.mnk_min_stats[ii],gs1.mnk_min_stats[ii+nsup], +// gs1.mnk_min_stats[ii+2*nsup], +// gs1.mnk_max_stats[ii],gs1.mnk_max_stats[ii+nsup],gs1.mnk_max_stats[ii+2*nsup]); +// } + +// // lastly put the timeing stats that we need + +// fprintf(fp,"Min %lf Max %lf totaltime %lf \n",gs1.osDgemmMin,gs1.osDgemmMax,stat->utime[FACT]); +// fclose(fp); +// } + +// } +// #endif + +#endif + + if ( options->PrintStat ) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use); + if ( options->RowPerm != NO ) + temp = SUPERLU_MAX(temp, GA_mem_use); + } else { + temp = SUPERLU_MAX ( + symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX(temp, num_mem_usage.total); + + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, 
grid->comm ); + stat->TinyPivots = TinyPivots; + + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if (!factored) */ + + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + sSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + options->SolveInitialized = NO; /* Reset the solve state */ + } + } +#if 0 + /* Need to revisit: Why the following is not good enough for X-to-B + distribution -- inv_perm_c changed */ + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct); +#endif + + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. 
+ ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { +#if 0 /* Sherry */ + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; +#endif + memcpy(x_col, b_col, m_loc * sizeof(float)); + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + if ( options->SolveInitialized == NO ) { /* First time */ + sSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, + SOLVEstruct); + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to psgssvx(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + } + + if ( options->DiagInv==YES && + (options->SolveInitialized == NO || Fact == SamePattern || + Fact == SamePattern_SameRowPerm) ) { + psCompute_Diag_Inv(n, LUstruct, grid, stat, info); + } + + + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + // } + // } + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. 
+ ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. */ + int_t *it; + int_t *colind_gsmv = SOLVEstruct->A_colind_gsmv; + /* This was allocated and set to NULL in sSolveInit() */ + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + + t = SuperLU_timer_(); + if ( options->RefineInitialized == NO || Fact == DOFACT ) { + /* All these cases need to re-initialize gsmv structure */ + if ( options->RefineInitialized ) + psgsmv_finalize(SOLVEstruct->gsmv_comm); + psgsmv_init(A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. */ + if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); + if ( !(it = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } else if ( Fact == SamePattern || + Fact == SamePattern_SameRowPerm ) { + float atemp; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in psgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + atemp = a[k]; a[k] = a[j]; a[j] = atemp; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to psgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; + } + + if ( nrhs == 1 ) { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } else { /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. 
+ */ + if ( !(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t))) ) + ABORT("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + ABORT("Malloc fails for gstrs_comm[]"); + psgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if ( nrhs > 1 ) { + pxgstrs_finalize(SOLVEstruct1->gstrs_comm); + SUPERLU_FREE(SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_() - t; + } /* end if IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf("\n (%d) .. After psPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before equilibration. 
*/ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 && *info == 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it was not used. */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + +#if 0 + if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist(&GAC); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx()"); +#endif + +} diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c new file mode 100644 index 00000000..80bdd329 --- /dev/null +++ b/SRC/psgssvx3d.c @@ -0,0 +1,1589 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PSGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + sScalePermstruct_t * ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + sLUstruct_t * LUstruct, sSOLVEstruct_t * SOLVEstruct, + float *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + float *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + float *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 's'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("psgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + sGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (float *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter psgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (float *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = 
(options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. 
*/ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + psgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pslaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + psCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (float *) GAstore->nzval; + GA_mem_use += nnz * sizeof (float); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = floatMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = floatMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = sldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + s_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pslangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: psgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = psdistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = sdist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_FLOAT, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + sp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = sinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + psgstrf3d (options, m, 
n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + sgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + s3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + sDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, 
MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = floatMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = floatMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to psgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + sSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + psgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + psgsmv_finalize (SOLVEstruct->gsmv_comm); + psgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + float at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + psgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) sScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgssvx3d()"); +#endif + +} diff --git a/SRC/psgssvx_ABglobal.c b/SRC/psgssvx_ABglobal.c new file mode 100644 index 00000000..f4d582a3 --- /dev/null +++ b/SRC/psgssvx_ABglobal.c @@ -0,0 +1,1112 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B, + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015   version 4.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgssvx_ABglobal solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ *
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has several options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *      -  A, the input matrix
+ *
+ *      as well as the following options, which are described in more
+ *      detail below:
+ *
+ *      -  options->Equil,   to specify how to scale the rows and columns
+ *                           of A to "equilibrate" it (to try to reduce its
+ *                           condition number and so improve the
+ *                           accuracy of the computed solution)
+ *
+ *      -  options->RowPerm, to specify how to permute the rows of A
+ *                           (typically to control numerical stability)
+ *
+ *      -  options->ColPerm, to specify how to permute the columns of A
+ *                           (typically to control fill-in and enhance
+ *                           parallelism during factorization)
+ *
+ *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                           pivots encountered during factorization
+ *                           (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *      -  ScalePermstruct,  modified to describe how the input matrix A
+ *                           was equilibrated and permuted:
+ *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                        columns of A were scaled
+ *         -  ScalePermstruct->R, array of row scale factors
+ *         -  ScalePermstruct->C, array of column scale factors
+ *         -  ScalePermstruct->perm_r, row permutation vector
+ *         -  ScalePermstruct->perm_c, column permutation vector
+ *
+ *            (part of ScalePermstruct may also need to be supplied on input,
+ *             depending on options->RowPerm and options->ColPerm as described
+ *             later).
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *                Pc*Pr*diag(R)*A*diag(C)
+ *             where
+ *                Pr and Pc are row and columns permutation matrices determined
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
+ *                  respectively, and
+ *                diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *      -  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
+ *               in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In this
+ *            case the algorithm saves time by reusing the previously computed
+ *            column permutation vector stored in ScalePermstruct->perm_c
+ *            and the "elimination tree" of A stored in LUstruct->etree.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->RowPerm
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->perm_c, the column permutation
+ *      -  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated and row permuted
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are ignored.
+ *      This is because the permutations from ScalePermstruct->perm_r and
+ *      ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
+ *                                     column scaled
+ *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
+ *      -  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                             if any
+ *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                  matrix
+ *      -  all of LUstruct, the previously computed information about L and U
+ *                (the actual numerical values of L and U stored in
+ *                 LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated
+ *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *      -  A, the unfactored matrix, only in the case that iterative refinement
+ *            is to be done (specifically A must be the output A from
+ *            the previous call, so that it has been scaled and permuted)
+ *      -  all of ScalePermstruct
+ *      -  all of LUstruct, including the actual numerical values of L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_APWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix*
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *         The number of linear equations is A->nrow. The type of A must be:
+ *         Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE. That is, A is stored in
+ *         compressed column format (also known as Harwell-Boeing format).
+ *         See supermatrix.h for the definition of 'SuperMatrix'.
+ *         This routine only handles square A, however, the LU factorization
+ *         routine psgstrf can factorize rectangular matrices.
+ *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
+ *         depending on ScalePermstruct->DiagScale, options->RowPerm and
+ *         options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->RowPerm != NATURAL, A is further overwritten by
+ *                Pr*diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                Pc*Pr*diag(R)*A*diag(C).
+ *         If all the above conditions are true, the LU decomposition is
+ *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ *         NOTE: Currently, A must reside in all processes when calling
+ *               this routine.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t*
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float*
+ *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
+ *         On exit, the solution matrix if info = 0;
+ *
+ *         NOTE: Currently, B must reside in all processes when calling
+ *               this routine.
+ *
+ * ldb     (input) int (global)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t*
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * berr    (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
+ * 
+ */ +void +psgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo_t *grid, + sLUstruct_t *LUstruct, float *berr, + SuperLUStat_t *stat, int *info) +{ + SuperMatrix AC; + NCformat *Astore; + NCPformat *ACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by DDISTRIBUTE + routine. They will be freed after DDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + fact_t Fact; + float *a; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *colptr, *rowind; + int_t Equil, factored, job, notran, colequ, rowequ; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; + int iam; + int ldx; /* LDA for matrix X (global). */ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + static superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + LUstruct->dt = 's'; + + /* Test input parameters. 
*/ + *info = 0; + Fact = options->Fact; + if ( Fact < 0 || Fact > FACTORED ) + *info = -1; + else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + *info = -1; + else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + *info = -1; + else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + *info = -1; + else if ( options->IterRefine == SLU_EXTRA ) { + *info = -1; + fprintf(stderr, "Extra precise iterative refinement yet to support."); + } else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < A->nrow ) + *info = -5; + else if ( nrhs < 0 ) + *info = -6; + if ( *info ) { + i = -(*info); + pxerr_dist("psgssvx_ABglobal", grid, -*info); + return; + } + + /* Initialization */ + factored = (Fact == FACTORED); + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + iam = grid->iam; + job = 5; + m = A->nrow; + n = A->ncol; + Astore = A->Store; + nnz = Astore->nnz; + a = Astore->nzval; + colptr = Astore->colptr; + rowind = Astore->rowind; + if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } else rowequ = colequ = FALSE; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgssvx_ABglobal()"); +#endif + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + if ( Equil && Fact != SamePattern_SameRowPerm ) { + /* Allocate storage if not done so before. 
*/ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------*/ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter equil"); +#endif + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm ) { + /* Reuse R and C. */ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + break; + case ROW: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow]; /* Scale rows. */ + } + } + break; + case COL: + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) + a[i] *= C[j]; /* Scale columns. */ + break; + case BOTH: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ + } + } + break; + } + } else { + if ( !iam ) { + /* Compute row and column scalings to equilibrate matrix A. 
*/ + sgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } else { + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", + iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", + iinfo-n); +#endif + } + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } + } + + if ( iinfo == 0 ) { + /* Equilibrate matrix A. */ + slaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + } + +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. equilibrated? *equed = %c\n", *equed); + /*fflush(stdout);*/ + } +#endif + } /* if Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit equil"); +#endif + } /* end if Equil ... */ + + /* ------------------------------------------------------------ + Permute rows of A. 
+ ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ + || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( !factored ) { + if ( job == 5 ) { + /* Allocate storage for scaling factors. */ + if ( !(R1 = (float *) SUPERLU_MALLOC(m * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if ( !(C1 = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + if ( !iam ) { + /* Process 0 finds a row permutation for large diagonal. */ + iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a, + perm_r, R1, C1); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } + + if ( iinfo && job == 5) { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } + +#if ( PRNTlevel>=2 ) + dmin = smach_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if ( job == 5 ) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) /* New diagonal */ + dprod *= fabs(a[i]); +#endif + } + } + + /* Multiply together the scaling factors. 
*/ + if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; + else for (i = 0; i < m; ++i) R[i] = R1[i]; + if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; + else for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + } else { /* No equilibration. */ + for (i = colptr[0]; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) { /* New diagonal */ + if ( job == 2 || job == 3 ) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if ( job == 4 ) + dsum += fabs(a[i]); + else if ( job == 5 ) + dprod *= fabs(a[i]); + } +#endif + } /* end for i ... */ + } /* end for j ... */ + } /* end else */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( PRNTlevel>=2 ) + if ( job == 2 || job == 3 ) { + if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); + } else if ( job == 4 ) { + if ( !iam ) printf("\tsum of diagonal %e\n", dsum); + } else if ( job == 5 ) { + if ( !iam ) printf("\t product of diagonal %e\n", dprod); + } +#endif + + } /* else !factored */ + + t = SuperLU_timer_() - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); +#endif + + } else { /* options->RowPerm == NOROWPERM */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if ( notran ) *(unsigned char *)norm = '1'; + else *(unsigned char *)norm = 'I'; + anorm = slangs_dist(norm, A); +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. anorm %e\n", anorm); +#endif + } + + /* ------------------------------------------------------------ + Perform the LU factorization. 
+ ------------------------------------------------------------*/ + if ( !factored ) { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ + get_perm_c_dist(iam, permc_spec, A, perm_c); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + sp_colorder(options, A, perm_c, etree, &AC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ + ACstore = AC.Store; + for (j = 0; j < n; ++j) + for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { + irow = ACstore->rowind[i]; + ACstore->rowind[i] = perm_c[irow]; + } + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Perform a symbolic factorization on matrix A and set up the + nonzero data structures which are suitable for supernodal GENP. */ + if ( Fact != SamePattern_SameRowPerm ) { +#if ( PRNTlevel>=1 ) + if ( !iam ) + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_(); + if ( !(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) + ABORT("Malloc fails for Glu_freeable."); + + iinfo = symbfact(options, iam, &AC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + + if ( iinfo <= 0 ) { + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); + printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); + printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); + printf("\tint %d, short %d, float %d, double %d\n", + (int) sizeof(int_t), (int) sizeof(short), + (int) sizeof(float), (int) sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu*1e-6, + symb_mem_usage.total*1e-6, + symb_mem_usage.expansions); + } +#endif + } else { /* symbfact out of memory */ +#if ( PRNTlevel>=1 ) + if ( !iam ) + fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo); +#endif + *info = iinfo; + return; + } + } + + /* Distribute the L and U factors onto the process grid. */ + t = SuperLU_timer_(); + dist_mem_use = sdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factor. */ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + /* Perform numerical factorization in parallel. 
*/ + t = SuperLU_timer_(); + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + +#if ( PRNTlevel>=1 ) + { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + temp = SUPERLU_MAX(symb_mem_usage.total, + symb_mem_usage.for_lu + + (float)dist_mem_use + num_mem_usage.for_lu); + temp = SUPERLU_MAX(temp, num_mem_usage.total); + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm ); + stat->TinyPivots = TinyPivots; + if ( !iam ) { + printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", + for_lu*1e-6, total*1e-6); + printf("\tAll space (MB):" + "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", + avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); + printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); + printf(".. psgstrf INFO = %d\n", *info); + } + } +#endif + + } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ + /* Permute columns of A to form A*Pc' using the existing perm_c. + * NOTE: rows of A were previously permuted to Pc*A. + */ + sp_colorder(options, A, perm_c, NULL, &AC); + } /* if !factored ... */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. 
+ ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pr*B. + ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + } + + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pc*B. + ------------------------------------------------------------*/ + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + psgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. 
*/ + t = SuperLU_timer_(); + psgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, + X, ldx, nrhs, berr, stat, info); + stat->utime[REFINE] = SuperLU_timer_() - t; + } + + /* Permute the solution matrix X <= Pc'*X. */ + for (j = 0; j < nrhs; j++) { + b_col = &B[j*ldb]; + x_col = &X[j*ldx]; + for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; + } + + /* Transform the solution matrix X to a solution of the original system + before the equilibration. */ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it is not used. */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + if ( !factored || (factored && options->IterRefine) ) + Destroy_CompCol_Permuted_dist(&AC); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx_ABglobal()"); +#endif +} + diff --git a/SRC/psgstrf.c b/SRC/psgstrf.c new file mode 100644 index 00000000..092db241 --- /dev/null +++ b/SRC/psgstrf.c @@ -0,0 +1,2004 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in parallel. + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 1, 1999
+ *   February 7, 2001  use MPI_Isend/MPI_Irecv
+ *   October 15, 2008  latency-reducing panel factorization
+ *   July    12, 2011  static scheduling and arbitrary look-ahead
+ *   March   13, 2013  change NTAGS to MPI_TAG_UB value
+ *   September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
+ *   December 31, 2015 rename xMACH to xMACH_DIST.
+ *   September 30, 2017 optimization for Intel Knights Landing (KNL) node.
+ *   June 1, 2018      add parallel AWPM pivoting; add back arrive_at_ublock()
+ *   February 8, 2019  version 6.1.1
+ *
+ * Sketch of the algorithm
+ *
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = Aik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( psgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ *
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           psgstrf2(k), one column at a time
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j)
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i)
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +#ifdef GPU_ACC +// #define NUM_GPU_STREAMS 16 +// #define NUM_GPU_STREAMS 16 +#include "gpublas_utils.h" +#endif + +/* Various defininations */ +/* + Name : SUPERNODE_PROFILE + Purpose : For SuperNode Level profiling of various measurements such as gigaflop/sec + obtained,bandwidth achieved: + Overhead : Low +*/ +// #define SUPERNODE_PROFILE + +/* + Name : BASELINE + Purpose : baseline to compare performance against + Overhead : NA : this won't be used for running experiments +*/ +// #define BASELINE + +/* + Name : PHI_FRAMEWORK + Purpose : To simulate and test algorithm used for offloading Phi + Overhead : NA : this won't be used for running experiments +*/ +#define PHI_FRAMEWORK + +#if 0 +#define CACHELINE 64 /* bytes, Xeon Phi KNL */ +#else +#define CACHELINE 0 /* not worry about false sharing of different threads */ +#endif +//#define GEMM_PADLEN 1 +#define GEMM_PADLEN 8 + +#define PSGSTRF2 psgstrf2_trsm + +#ifdef ISORT +extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); +extern void isort1 (int_t N, int_t * ARRAY); + +#else + +int +superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + + +/************************************************************************/ + +#include "sscatter.c" + +/************************************************************************/ + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t +psgstrf(superlu_dist_options_t * options, int m, int n, float anorm, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs = _cptofcd ("N", strlen ("N")); + _fcd ftcs1 = _cptofcd ("L", strlen ("L")); + _fcd ftcs2 = _cptofcd ("N", strlen ("N")); + _fcd ftcs3 = _cptofcd ("U", strlen ("U")); +#endif + float zero = 0.0, alpha = 1.0, beta = 0.0; + int_t *xsup; + int_t *lsub, *lsub1, *usub, *Usub_buf; + int_t **Lsub_buf_2, **Usub_buf_2; + float **Lval_buf_2, **Uval_buf_2; /* pointers to starts of bufs */ + float *lusup, *lusup1, *uval, *Uval_buf; /* pointer to current buf */ + int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, + lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, + nlb, nub, nsupc, rel, rukp, il, iu; + int_t Pc, Pr; + int iam, kcol, krow, yourcol, mycol, myrow, pi, pj; + int j, k, lk, nsupers; /* k - current panel to work on */ + int k0; /* counter of the next supernode to be factored */ + int kk, kk0, kk1, kk2, jj0; /* panels in the look-ahead window */ + int iukp0, rukp0, flag0, flag1; + int nsupr, nbrow, segsize; + int msg0, msg2; + int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; + float **Unzval_br_ptr, **Lnzval_bc_ptr; + int_t *index; + float *nzval; + float *ucol; + int *indirect, *indirect2; + int_t *tempi; + float *tempu, *tempv, *tempr; + /* float *tempv2d, *tempU2d; Sherry */ + int iinfo; + int *ToRecv, *ToSendD, **ToSendR; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + superlu_scope_t *scp; + float s_eps; + double thresh; + /*int full;*/ + int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; +#ifdef ISORT + int_t *iperm_u; +#endif + int *msgcnt; /* Count the size of the message xfer'd in each buffer: + 
* 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int **msgcnts, **msgcntsU; /* counts in the look-ahead window */ + int *factored; /* factored[j] == 0 : L col panel j is factorized. */ + int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */ + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + int num_look_aheads, look_id; + int *look_ahead; /* global look_ahead table */ + int_t *perm_c_supno, *iperm_c_supno; + /* perm_c_supno[k] = j means at the k-th step of elimination, + * the j-th supernode is chosen. */ + MPI_Request *recv_req, **recv_reqs, **send_reqs, **send_reqs_u, + **recv_reqs_u; + MPI_Request *send_req, *U_diag_blk_send_req = NULL; + MPI_Status status; + void *attr_val; + int flag; + + /* The following variables are used to pad GEMM dimensions so that + each is a multiple of vector length (8 doubles for KNL) */ + int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN, + gemm_n_pad = GEMM_PADLEN; + int gemm_padding = 0; + + int iword = sizeof (int_t); + int dword = sizeof (float); + + /* For measuring load imbalence in omp threads */ + double omp_load_imblc = 0.0; + double *omp_loop_time; + + double schur_flop_timer = 0.0; + double pdgstrf2_timer = 0.0; + double pdgstrs2_timer = 0.0; + double lookaheadupdatetimer = 0.0; + double InitTimer = 0.0; /* including compute schedule, malloc */ + double tt_start, tt_end; + +/* #if !defined( GPU_ACC ) */ + /* Counters for memory operations and timings */ + double scatter_mem_op_counter = 0.0; + double scatter_mem_op_timer = 0.0; + double scatterL_mem_op_counter = 0.0; + double scatterL_mem_op_timer = 0.0; + double scatterU_mem_op_counter = 0.0; + double scatterU_mem_op_timer = 0.0; + + /* Counters for flops/gather/scatter and timings */ + double GatherLTimer = 0.0; + double LookAheadRowSepMOP = 0.0; + double GatherUTimer = 0.0; + double 
GatherMOP = 0.0; + double LookAheadGEMMTimer = 0.0; + double LookAheadGEMMFlOp = 0.0; + double LookAheadScatterTimer = 0.0; + double LookAheadScatterMOP = 0.0; + double RemainGEMMTimer = 0.0; + double RemainGEMM_flops = 0.0; + double RemainScatterTimer = 0.0; + double NetSchurUpTimer = 0.0; + double schur_flop_counter = 0.0; +/* #endif */ + +#if ( PRNTlevel>= 1) + /* count GEMM max dimensions */ + int gemm_max_m = 0, gemm_max_n = 0, gemm_max_k = 0; +#endif + +#if ( DEBUGlevel>=2 ) + int_t num_copy = 0, num_update = 0; +#endif +#if ( PRNTlevel==3 ) + int zero_msg = 0, total_msg = 0; +#endif +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + double comm_wait_time = 0.0; + /* Record GEMM dimensions and times */ + FILE *fopen(), *fgemm; + int gemm_count = 0; + typedef struct { + int m, n, k; + double microseconds; + } gemm_profile; + gemm_profile *gemm_stats; +#endif + + /* Test the input parameters. */ + *info = 0; + if (m < 0) + *info = -2; + else if (n < 0) + *info = -3; + if (*info) { + pxerr_dist ("psgstrf", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + double tt1 = SuperLU_timer_ (); + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + s_eps = smach_dist("Epsilon"); + thresh = s_eps * anorm; + + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) { + fprintf (stderr, "Could not get TAG_UB\n"); + return (-1); + } + int tag_ub = *(int *) attr_val; + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + +#if ( DEBUGlevel>=1 ) + if (s_eps == 0.0) + printf (" ***** warning s_eps = %e *****\n", s_eps); + CHECK_MALLOC (iam, "Enter psgstrf()"); +#endif +#if (PROFlevel >= 1 ) + gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile)); + if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w"); + int *prof_sendR = intCalloc_dist(nsupers); +#endif + + stat->ops[FACT] = 0.0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + + /* make sure the range of look-ahead window [0, MAX_LOOKAHEADS-1] */ + num_look_aheads = SUPERLU_MAX(0, SUPERLU_MIN(options->num_lookaheads, MAX_LOOKAHEADS - 1)); + + if (Pr * Pc > 1) { + if (!(U_diag_blk_send_req = + (MPI_Request *) SUPERLU_MALLOC (Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for U_diag_blk_send_req[]."); + /* flag no outstanding Isend */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; /* used 0 before */ + + /* allocating buffers for look-ahead */ + i = Llu->bufmax[0]; + if (i != 0) { + if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) ) + ABORT ("Malloc fails for Lsub_buf."); + tempi = Llu->Lsub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i; + } + i = Llu->bufmax[1]; + if (i != 0) { + if (!(Llu->Lval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * ((size_t) i)))) + ABORT 
("Malloc fails for Lval_buf[]."); + tempr = Llu->Lval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i; + } + i = Llu->bufmax[2]; + if (i != 0) { + if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Usub_buf_2[]."); + tempi = Llu->Usub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i; + } + i = Llu->bufmax[3]; + if (i != 0) { + if (!(Llu->Uval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Uval_buf_2[]."); + tempr = Llu->Uval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i; + } + } + + log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) + * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) + * dword, stat ); + + /* creating pointers to the look-ahead buffers */ + if (! (Lsub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Lsub_buf_2[]."); + if (! (Lval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for Lval_buf_2[]."); + if (! (Usub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Uval_buf_2[]."); + if (! 
(Uval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for buf_2[]."); + for (i = 0; i <= num_look_aheads; i++) { + Lval_buf_2[i] = Llu->Lval_buf_2[i]; + Lsub_buf_2[i] = Llu->Lsub_buf_2[i]; + Uval_buf_2[i] = Llu->Uval_buf_2[i]; + Usub_buf_2[i] = Llu->Usub_buf_2[i]; + } + + if (!(msgcnts = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcntsU[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(msgcnts[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcntsU[]."); + } + + if (! (recv_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs_u[]."); + if (! (send_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! (send_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! 
(recv_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(recv_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req_u[i]."); + if (!(send_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_req_u[i]."); + if (!(send_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pc * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_reqs[i]."); + if (!(recv_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (4 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req[]."); + send_reqs[i][0] = send_reqs[i][1] = MPI_REQUEST_NULL; + recv_reqs[i][0] = recv_reqs[i][1] = MPI_REQUEST_NULL; + } + + if (!(factored = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factored[]."); + if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factoredU[]."); + for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1; + + log_memory(2 * nsupers * iword, stat); + + int num_threads = 1; +#ifdef _OPENMP +#pragma omp parallel default(shared) + #pragma omp master + { + num_threads = omp_get_num_threads (); + } +#endif + +#if 0 + omp_loop_time = (double *) _mm_malloc (sizeof (double) * num_threads,64); +#else + omp_loop_time = (double *) SUPERLU_MALLOC(num_threads * sizeof(double)); +#endif + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf(".. 
Starting with %d OpenMP threads \n", num_threads ); + fflush(stdout); + } +#endif + + nblocks = 0; + ncb = nsupers / Pc; /* number of column blocks, horizontal */ + nrb = nsupers / Pr; /* number of row blocks, vertical */ + + /* in order to have dynamic scheduling */ + int *full_u_cols; + int *blk_ldu; +#if 0 + full_u_cols = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); + blk_ldu = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); +#else + full_u_cols = SUPERLU_MALLOC((ncb+1) * sizeof(int)); + blk_ldu = SUPERLU_MALLOC((ncb+1) * sizeof(int)); // +1 to accommodate un-even division +#endif + + log_memory(2 * ncb * iword, stat); + +#if 0 /* Sherry: not used? */ + /* This bunch is used for static scheduling */ + pair *full_col_count = (pair *) _mm_malloc (sizeof (pair) * ncb,64); + int_t *count_cols, *sum_cols, *partition; + count_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + sum_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + partition = (int_t *) _mm_malloc (sizeof (int_t) * num_threads * ncb,64); + int_t ldp = ncb; +#endif + + /* ################################################################## + * Compute a good static schedule based on the factorization task graph. + * ################################################################## */ + perm_c_supno = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); + iperm_c_supno = perm_c_supno + nsupers; + + sstatic_schedule(options, m, n, LUstruct, grid, stat, + perm_c_supno, iperm_c_supno, info); + +#if ( DEBUGlevel >= 2 ) + PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno); + + /* Turn off static schedule */ + printf("[%d] .. 
Turn off static schedule for debugging ..\n", iam); + for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i; +#endif + /* ################################################################## */ + + /* constructing look-ahead table to indicate the last dependency */ + int *look_ahead_l; /* Sherry: add comment on look_ahead_l[] */ + stat->num_look_aheads = num_look_aheads; + + look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */ + log_memory(3 * nsupers * iword, stat); + + /* Sherry: omp parallel? + not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */ + ib = lb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[lb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; /* global block number */ + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) { /* leftover block rows */ + ib = nrb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[nrb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (options->SymPattern == NO) { + /* Sherry: omp parallel? 
+ not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < ncb; lb++) { /* go through L-factor */ + ib = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { /* leftover block columns */ + ib = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, grid->comm); + SUPERLU_FREE (look_ahead_l); + +#ifdef ISORT + iperm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + perm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); +#else + perm_u = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); +#endif + log_memory(nsupers * iword, stat); + + k = sp_ienv_dist (3); /* max supernode size */ +#if 0 + if ( !(Llu->ujrow = floatMalloc_dist(k*(k+1)/2)) ) + ABORT("Malloc fails for ujrow[]."); +#else + /* Instead of half storage, we'll do full storage */ + if (!(Llu->ujrow = floatCalloc_dist (k * k))) + ABORT ("Malloc fails for ujrow[]."); +#endif + log_memory(k * k * iword, stat); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, + thresh); + printf + (".. 
Buffer size: Lsub %ld\tLval %ld\tUsub %ld\tUval %ld\tLDA %ld\n", + (long int) Llu->bufmax[0], (long int) Llu->bufmax[1], + (long int) Llu->bufmax[2], (long int) Llu->bufmax[3], + (long int) Llu->bufmax[4]); + fflush(stdout); + } +#endif + + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + ToRecv = Llu->ToRecv; + ToSendD = Llu->ToSendD; + ToSendR = Llu->ToSendR; + + ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + k = CEILING (nsupers, Pr); /* Number of local block rows */ + + /* Following code is for finding maximum row dimension of all L panels */ + int local_max_row_size = 0; + int max_row_size; + +#if 0 +#if defined _OPENMP // Sherry: parallel reduction -- seems slower? +#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) +#endif +#endif + for (i = mycol; i < nsupers; i += Pc) { /* grab my local columns */ + //int tpc = PCOL (i, grid); + lk = LBj (i, grid); + lsub = Lrowind_bc_ptr[lk]; + if (lsub != NULL) { + if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1]; + } + + } + + /* Max row size is global reduction within a row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + /* Buffer size is max of look-ahead window */ + /* int_t buffer_size = + SUPERLU_MAX (max_row_size * num_threads * ldt, + get_max_buffer_size ()); */ + +#ifdef GPU_ACC /*-------- use GPU --------*/ + int gpublas_nb = get_gpublas_nb(); // default 64 + int nstreams = get_num_gpu_streams (); // default 8 + + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); + /* array holding last column blk for each partition, + used in SchCompUdt-cuda.c */ + #if 0 + int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64); + #else + int *stream_end_col = SUPERLU_MALLOC( nstreams * sizeof(int) ); + #endif + +#else /* not to use GPU 
*/ + + int Threads_per_process = get_thread_per_process(); + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ + + int_t max_ncols = 0; +#if 0 + /* symmetric assumption -- using L's supernode to estimate. */ + /* Note that in following expression 8 can be anything + as long as its not too big */ + int bigu_size = 8 * sp_ienv_dist (3) * (max_row_size); +#else + int_t bigu_size = estimate_bigu_size( nsupers, Ufstnz_br_ptr, Glu_persist, + grid, perm_u, &max_ncols ); +#endif + + /* +16 to avoid cache line false sharing */ + // int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt), + int_t bigv_size = SUPERLU_MAX(max_row_size * max_ncols, + (ldt*ldt + CACHELINE / dword) * num_threads); + + /* bigU and bigV are only allocated on CPU, but may be allocated as + page-locked memory accessible to GPU. */ + float* bigU; /* for storing entire U(k,:) panel, prepare for GEMM. */ + float* bigV; /* for storing GEMM output matrix, i.e. update matrix. + bigV is large enough to hold the aggregate GEMM output.*/ + bigU = NULL; /* allocated only on CPU */ + bigV = NULL; + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } + printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size); + fflush(stdout); +#endif + +#ifdef GPU_ACC /*-- use GPU --*/ + + if ( checkGPU(gpuHostAlloc((void**)&bigU, bigu_size * sizeof(float), gpuHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer U "); + +#if 0 // !!Sherry fix -- only dC on GPU uses buffer_size + bigv_size = buffer_size; +#endif + +#if ( PRNTlevel>=1 ) + printf("[%d].. 
BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); + fflush(stdout); +#endif + + if ( checkGPU(gpuHostAlloc((void**)&bigV, bigv_size * sizeof(float) ,gpuHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer V"); + +#if ( PRNTlevel>=1 ) + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d gpu streams \n",nstreams ); + fflush(stdout); + } +#endif + + gpublasHandle_t *handle; + handle = (gpublasHandle_t *) SUPERLU_MALLOC(sizeof(gpublasHandle_t)*nstreams); + for(int i = 0; i < nstreams; i++) handle[i] = create_handle(); + + // creating streams + gpuStream_t *streams; + streams = (gpuStream_t *) SUPERLU_MALLOC(sizeof(gpuStream_t)*nstreams); + for (int i = 0; i < nstreams; ++i) + checkGPU( gpuStreamCreate(&streams[i]) ); + + // allocating data in device + float *dA, *dB, *dC; + gpuError_t gpuStat; +#if 0 + // gpuStat = gpuMalloc( (void**)&dA, m*k*sizeof(double)); + // HOw much should be the size of dA? + // for time being just making it + // gpuStat = gpuMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double)); +#endif + + gpuStat = gpuMalloc( (void**)&dA, max_row_size*sp_ienv_dist(3)* sizeof(float)); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! Error in allocating A in the device %ld \n",m*k*sizeof(float) ); + return 1; + } + + // size of B should be bigu_size + gpuStat = gpuMalloc((void**)&dB, bigu_size * sizeof(float)); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(float)); + return 1; + } + + gpuStat = gpuMalloc((void**)&dC, buffer_size * sizeof(float) ); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! 
Error in allocating C in the device \n" ); + return 1; + } + + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC + +#else /*-------- not to use GPU --------*/ + + // for GEMM padding 0 + j = bigu_size / ldt; + bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad)); + bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); + +#if ( PRNTlevel>=1 ) + printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size); + fflush(stdout); +#endif + +//#ifdef __INTEL_COMPILER +// bigU = _mm_malloc(bigu_size * sizeof(float), 1<<12); // align at 4K page +// bigV = _mm_malloc(bigv_size * sizeof(float), 1<<12); +//#else + if ( !(bigU = floatMalloc_dist(bigu_size)) ) + ABORT ("Malloc fails for sgemm U buffer"); + if ( !(bigV = floatMalloc_dist(bigv_size)) ) + ABORT ("Malloc failed for sgemm V buffer"); +//#endif + +#endif +/*************** end ifdef GPU_ACC ****************/ + + log_memory((bigv_size + bigu_size) * dword, stat); + + // mlock(bigU,(bigu_size) * sizeof (double)); + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf (" Max row size is %d \n", max_row_size); + printf (" Threads per process %d \n", num_threads); + fflush(stdout); + } + +#endif + + /* Sherry: (ldt + 16), avoid cache line false sharing. 
+ KNL cacheline size = 64 bytes = 16 int */ + iinfo = ldt + CACHELINE / sizeof(int); + if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + + log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword, stat); + + int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib, + *RemainStRow,*Remain_lptr,*Remain_ib; + + lookAheadFullRow = intMalloc_dist( (num_look_aheads+1) ); + lookAheadStRow = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_lptr = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_ib = intMalloc_dist( (num_look_aheads+1) ); + + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + + RemainStRow = intMalloc_dist(mrb); +#if 0 + Remain_lptr = (int *) _mm_malloc(sizeof(int)*mrb,1); +#else + Remain_lptr = intMalloc_dist(mrb); +#endif + // mlock(Remain_lptr, sizeof(int)*mrb ); + Remain_ib = intMalloc_dist(mrb); + + Remain_info_t *Remain_info; +#if 0 + Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64); +#else + Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t)); +#endif + + float *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */ + Ublock_info_t *Ublock_info; + ldt = sp_ienv_dist (3); /* max supernode size */ + /* The following is quite loose */ + lookAhead_L_buff = floatMalloc_dist(ldt*ldt* (num_look_aheads+1) ); + +#if 0 + Remain_L_buff = (float *) _mm_malloc( sizeof(float)*(Llu->bufmax[1]),64); + Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); + /*int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); */ +#else + j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); + Remain_L_buff = floatMalloc_dist(Llu->bufmax[1] + 
j); /* This is loose */ + Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); + /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ +#endif + + long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads+1) * dword + + Llu->bufmax[1] * dword ; + log_memory(alloc_mem, stat); + + InitTimer = SuperLU_timer_() - tt1; + + double pxgstrfTimer = SuperLU_timer_(); + + /* ################################################################## + ** Handle first block column separately to start the pipeline. ** + ################################################################## */ + look_id = 0; + msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */ + send_req = send_reqs[0]; + recv_req = recv_reqs[0]; + + k0 = 0; + k = perm_c_supno[0]; + kcol = PCOL (k, grid); + krow = PROW (k, grid); + if (mycol == kcol) { + double ttt1 = SuperLU_timer_(); + + /* panel factorization */ + PSGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu, + U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_()-ttt1; + + scp = &grid->rscp; /* The scope of process row. */ + + /* Multicasts numeric values of L(:,0) to process rows. */ + lk = LBj (k, grid); /* Local block number. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if (lsub) { + /* number of entries in Lsub_buf[] to be transferred */ + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + /* number of entries in Lval_buf[] to be transferred */ + msgcnt[1] = lsub[1] * SuperSize (k); + } else { + msgcnt[0] = msgcnt[1] = 0; + } + + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, 0) /* 0 */, + scp->comm, &send_req[pj]); + MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, 0) /* 1 */, + scp->comm, &send_req[pj + Pc]); +#if ( DEBUGlevel>=2 ) + printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", + iam, 0, msgcnt[0], msgcnt[1], pj); +#endif + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; + msg_cnt += 2; + msg_vol += msgcnt[0] * iword + msgcnt[1] * dword; +#endif + } /* end if */ + } /* end for pj ... */ + } else { /* Post immediate receives. */ + if (ToRecv[k] >= 1) { /* Recv block column L(:,0). */ + scp = &grid->rscp; /* The scope of process row. */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol, + SLU_MPI_TAG (0, 0) /* 0 */ , + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_FLOAT, kcol, + SLU_MPI_TAG (1, 0) /* 1 */ , + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } /* end if mycol == 0 */ + + factored[k] = 0; /* flag column k as factored. */ + + /* post receive of first U-row */ + if (myrow != krow) { + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ + scp = &grid->cscp; /* The scope of process column. 
*/ + Usub_buf = Llu->Usub_buf_2[0]; + Uval_buf = Llu->Uval_buf_2[0]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, 0) /* 2%tag_ub */ , + scp->comm, &recv_reqs_u[0][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, 0) /* 3%tag_ub */ , + scp->comm, &recv_reqs_u[0][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + /* ################################################################## + **** MAIN LOOP **** + ################################################################## */ + for (k0 = 0; k0 < nsupers; ++k0) { + k = perm_c_supno[k0]; + + /* ============================================ * + * ======= look-ahead the new L columns ======= * + * ============================================ */ + /* tt1 = SuperLU_timer_(); */ + if (k0 == 0) { /* look-ahead all the columns in the window */ + kk1 = k0 + 1; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + } else { /* look-ahead one new column after the current window */ + kk1 = k0 + num_look_aheads; + kk2 = SUPERLU_MIN (kk1, nsupers - 1); + } + + for (kk0 = kk1; kk0 <= kk2; kk0++) { + /* loop through look-ahead window in L */ + + kk = perm_c_supno[kk0]; /* use the ordering from static schedule */ + look_id = kk0 % (1 + num_look_aheads); /* which column in window */ + + if (look_ahead[kk] < k0) { /* does not depend on current column k */ + kcol = PCOL (kk, grid); + if (mycol == kcol) { /* I own this panel */ + + /* Panel factorization -- Factor diagonal and subdiagonal + L blocks and test for exact singularity. */ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + + PSGSTRF2 (options, kk0, kk, thresh, Glu_persist, + grid, Llu, U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Multicasts numeric values of L(:,kk) to process rows. 
*/ + /* ttt1 = SuperLU_timer_(); */ + msgcnt = msgcnts[look_id]; /* point to the proper count array */ + send_req = send_reqs[look_id]; + + lk = LBj (kk, grid); /* Local block number in L. */ + lsub1 = Lrowind_bc_ptr[lk]; + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */ + msgcnt[1] = lsub1[1] * SuperSize (kk); /* Lval_buf[] size */ + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { + lusup1 = Lnzval_bc_ptr[lk]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n", + iam, kk, msgcnt[0], msgcnt[1], pj); +#endif + } + } + /* stat->time9 += SuperLU_timer_() - ttt1; */ + } else { /* Post Recv of block column L(:,kk). */ + /* double ttt1 = SuperLU_timer_(); */ + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. 
*/ + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + /* stat->time10 += SuperLU_timer_() - ttt1; */ + } /* end if mycol == Pc(kk) */ + } /* end if look-ahead in L panels */ + + /* Pre-post irecv for U-row look-ahead */ + krow = PROW (kk, grid); + if (myrow != krow) { + if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */ + scp = &grid->cscp; /* The scope of process column. */ + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + } /* end for each column in look-ahead window for L panels */ + + /* stat->time4 += SuperLU_timer_()-tt1; */ + + /* ================================= * + * ==== look-ahead the U rows === * + * ================================= */ + kk1 = k0; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = kk1; kk0 < kk2; kk0++) { + kk = perm_c_supno[kk0]; /* order determined from static schedule */ + if (factoredU[kk0] != 1 && look_ahead[kk] < k0) { + /* does not depend on current column k */ + kcol = PCOL (kk, grid); + krow = PROW (kk, grid); + lk = LBj (kk, grid); /* Local block number across row. NOT USED?? 
-- Sherry */ + + look_id = kk0 % (1 + num_look_aheads); + msgcnt = msgcntsU[look_id]; + recv_req = recv_reqs[look_id]; + + /* ================================================= * + * Check if diagonal block has been received * + * for panel factorization of U in look-ahead window * + * ================================================= */ + + if (mycol == kcol) { /* I own this column panel, no need + to receive L */ + flag0 = flag1 = 1; + msgcnt[0] = msgcnt[1] = -1; /* No need to transfer Lsub, nor Lval */ + } else { /* Check to receive L(:,kk) from the left */ + flag0 = flag1 = 0; + if ( ToRecv[kk] >= 1 ) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if ( recv_req[0] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[0], &flag0, &status); + if ( flag0 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + } else flag0 = 1; + + if ( recv_req[1] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[1], &flag1, &status); + if ( flag1 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + } else flag1 = 1; +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } else { + msgcnt[0] = 0; + } + } + + if (flag0 && flag1) { /* L(:,kk) is ready */ + /* tt1 = SuperLU_timer_(); */ + scp = &grid->cscp; /* The scope of process column. */ + if (myrow == krow) { + factoredU[kk0] = 1; + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, + Ublock_info, stat); + } + + pdgstrs2_timer += SuperLU_timer_()-ttt2; + /* stat->time8 += SuperLU_timer_()-ttt2; */ + + /* Multicasts U(kk,:) to process columns. 
*/ + lk = LBi (kk, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + if (usub) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, kk0), /* (4*kk0+2)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi]); + MPI_Isend (uval, msgcnt[3], MPI_FLOAT, + pi, SLU_MPI_TAG (3, kk0), /* (4*kk0+3)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi + Pr]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) to Pr %2d\n", + iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } /* end if myrow == krow */ + } /* end if flag0 & flag1 ... */ + } /* end if factoredU[] ... */ + } /* end for kk0 ... */ + + /* ============================================== * + * == start processing the current row of U(k,:) * + * ============================================== */ + knsupc = SuperSize (k); + krow = PROW (k, grid); + kcol = PCOL (k, grid); + + /* tt1 = SuperLU_timer_(); */ + look_id = k0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; + + if (mycol == kcol) { + lk = LBj (k, grid); /* Local block number in L */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (pj = 0; pj < Pc; ++pj) { + /* Wait for Isend to complete before using lsub/lusup buffer. 
*/ + if (ToSendR[lk][pj] != EMPTY) { + MPI_Wait (&send_req[pj], &status); + MPI_Wait (&send_req[pj + Pc], &status); + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + } else { + if (ToRecv[k] >= 1) { /* Recv block column L(:,k). */ + + scp = &grid->rscp; /* The scope of process row. */ + + /* ============================================= * + * Waiting for L(:,kk) for outer-product uptate * + * if iam in U(kk,:), then the diagonal block * + * did not reach in time for panel factorization * + * of U(k,:). * + * ============================================= */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if (recv_req[0] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } else { + msgcnt[0] = msgcntsU[look_id][0]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", + iam, k, look_id, msgcnt[0]); +#endif + } + + if (recv_req[1] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } else { + msgcnt[1] = msgcntsU[look_id][1]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", + iam, k, look_id, msgcnt[1]); +#endif + } + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif +#if ( DEBUGlevel>=2 ) + printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n", + iam, k, msgcnt[0], msgcnt[1], kcol); + fflush (stdout); +#endif + +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[0]) ++zero_msg; +#endif + } else { + msgcnt[0] = 0; + } + + lsub = Lsub_buf_2[look_id]; + lusup = Lval_buf_2[look_id]; + } /* else if mycol = Pc(k) */ + /* stat->time1 += SuperLU_timer_()-tt1; */ + + scp = &grid->cscp; /* The scope 
of process column. */ + + /* tt1 = SuperLU_timer_(); */ + if (myrow == krow) { /* I own U(k,:) */ + lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + + if (factoredU[k0] == -1) { + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (k0, k, Glu_persist, grid, Llu, + Ublock_info, stat); + } + pdgstrs2_timer += SuperLU_timer_() - ttt2; + + /* Sherry -- need to set factoredU[k0] = 1; ?? */ + + /* Multicasts U(k,:) along process columns. */ + if ( usub ) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { /* Matching recv was pre-posted before */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Send (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */ + scp->comm); + MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi, + SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ + scp->comm); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) down to Pr %2d\n", iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + } else { /* Panel U(k,:) already factorized from previous look-ahead */ + + /* ================================================ * + * Wait for downward sending of U(k,:) to complete * + * for outer-product update. 
* + * ================================================ */ + + if (ToSendD[lk] == YES) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { + MPI_Wait (&send_reqs_u[look_id][pi], &status); + MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + msgcnt[2] = msgcntsU[look_id][2]; + msgcnt[3] = msgcntsU[look_id][3]; + } + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } else { /* myrow != krow */ + + /* ========================================== * + * Wait for U(k,:) for outer-product updates. * + * ========================================== */ + + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Wait (&recv_reqs_u[look_id][0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[2]); + MPI_Wait (&recv_reqs_u[look_id][1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + usub = Usub_buf; + uval = Uval_buf; +#if ( DEBUGlevel>=2 ) + printf ("[%d] Recv U(%4d,:) from Pr %2d\n", iam, k, krow); +#endif +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[2]) ++zero_msg; +#endif + } else { + msgcnt[2] = 0; + } + /* stat->time6 += SuperLU_timer_()-tt1; */ + } /* end if myrow == Pr(k) */ + + /* + * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). + * for (j = k+1; k < N; ++k) { + * for (i = k+1; i < N; ++i) + * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) + * && L(i,k) != 0 && U(k,j) != 0 ) + * A(i,j) = A(i,j) - L(i,k) * U(k,j); + */ + msg0 = msgcnt[0]; + msg2 = msgcnt[2]; + /* tt1 = SuperLU_timer_(); */ + if (msg0 && msg2) { /* L(:,k) and U(k,:) are not empty. */ + nsupr = lsub[1]; /* LDA of lusup. */ + if (myrow == krow) { /* Skip diagonal block L(k,k). 
*/ + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + } else { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + } + iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + rukp = 0; /* Pointer to nzval[] of U(k,:) */ + nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + klst = FstBlockC (k + 1); + + /* ------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead) + ------------------------------------------------------------- */ + iukp0 = iukp; + rukp0 = rukp; + /* reorder the remaining columns in bottome-up */ + /* TAU_STATIC_TIMER_START("LOOK_AHEAD_UPDATE"); */ + for (jj = 0; jj < nub; jj++) { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + jb = usub[iukp]; /* Global block number of block U(k,j). */ + nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + /* iperm_u is sorted based on elimination order; + perm_u reorders the U blocks to match the elimination order. 
*/ + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + +/************************************************************************/ + double ttx =SuperLU_timer_(); + +//#include "slook_ahead_update_v4.c" +#include "slook_ahead_update.c" + + lookaheadupdatetimer += SuperLU_timer_() - ttx; +/************************************************************************/ + + /*ifdef OMP_LOOK_AHEAD */ + /* TAU_STATIC_TIMER_STOP("LOOK_AHEAD_UPDATE"); */ + } /* if L(:,k) and U(k,:) not empty */ + + /* stat->time3 += SuperLU_timer_()-tt1; */ + + /* ================== */ + /* == post receive == */ + /* ================== */ + kk1 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = k0 + 1; kk0 <= kk1; kk0++) { + kk = perm_c_supno[kk0]; + kcol = PCOL (kk, grid); + + if (look_ahead[kk] == k0) { + if (mycol != kcol) { + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. */ + + look_id = kk0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } else { + lk = LBj (kk, grid); /* Local block number. */ + lsub1 = Lrowind_bc_ptr[lk]; + lusup1 = Lnzval_bc_ptr[lk]; + if (factored[kk] == -1) { + /* Factor diagonal and subdiagonal blocks and + test for exact singularity. 
*/ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + PSGSTRF2 (options, kk0, kk, thresh, + Glu_persist, grid, Llu, U_diag_blk_send_req, + tag_ub, stat, info); + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Process column *kcol+1* multicasts numeric + values of L(:,k+1) to process rows. */ + look_id = kk0 % (1 + num_look_aheads); + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub1[1] * SuperSize (kk); + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif + } + } /* end for pj ... */ + } /* if factored[kk] ... */ + } + } + } + + double tsch = SuperLU_timer_(); + + /*******************************************************************/ + +#ifdef GPU_ACC /*-- GPU --*/ + +#include "sSchCompUdt-cuda.c" + +#else + +/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/ +//#include "sSchCompUdt-2Ddynamic_v6.c" + +#include "sSchCompUdt-2Ddynamic.c" + +#endif + /*uncomment following to compare against SuperLU 3.3 baseline*/ + /* #include "SchCompUdt--baseline.c" */ + /************************************************************************/ + + NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* MAIN LOOP for k0 = 0, ... */ + + /* ################################################################## + ** END MAIN LOOP: for k0 = ... 
+ ################################################################## */ + + pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer; + +#if ( PRNTlevel>=2 ) + /* Print detailed statistics */ + /* Updating total flops */ + double allflops; + MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, + 0, grid->comm); + if ( iam==0 ) { + printf("\nInitialization time\t%8.4lf seconds\n" + "\t Serial: compute static schedule, allocate storage\n", InitTimer); + printf("\n==== Time breakdown in factorization (rank 0) ====\n"); + printf("Panel factorization \t %8.4lf seconds\n", + pdgstrf2_timer + pdgstrs2_timer); + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. Time in GEMM %8.4lf \n", + LookAheadGEMMTimer + RemainGEMMTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", + RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); + printf(".. 
Time to Scatter %8.4lf \n", + LookAheadScatterTimer + RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); + + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); + printf("--------\n"); + printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); + } +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + + /******************************************************** + * Free memory * + ********************************************************/ + + if (Pr * Pc > 1) { + SUPERLU_FREE (Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ + SUPERLU_FREE (Lval_buf_2[0]); /* also free Lval_buf_2[1] */ + if (Llu->bufmax[2] != 0) + SUPERLU_FREE (Usub_buf_2[0]); + if (Llu->bufmax[3] != 0) + SUPERLU_FREE (Uval_buf_2[0]); + if (U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) { + /* wait for last Isend requests to complete, deallocate objects */ + for (krow = 0; krow < Pr; ++krow) { + if (krow != myrow) + MPI_Wait (U_diag_blk_send_req + krow, &status); + } + } + SUPERLU_FREE (U_diag_blk_send_req); + } + + log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword), + stat ); + + SUPERLU_FREE (Lsub_buf_2); + SUPERLU_FREE (Lval_buf_2); + SUPERLU_FREE (Usub_buf_2); + SUPERLU_FREE (Uval_buf_2); + SUPERLU_FREE (perm_c_supno); + SUPERLU_FREE (perm_u); +#ifdef ISORT + SUPERLU_FREE (iperm_u); +#endif + SUPERLU_FREE (look_ahead); + SUPERLU_FREE (factoredU); + SUPERLU_FREE (factored); + log_memory(-(6 * nsupers * iword), stat); + + for (i = 0; i <= num_look_aheads; i++) { + SUPERLU_FREE (msgcnts[i]); + SUPERLU_FREE (msgcntsU[i]); + 
} + SUPERLU_FREE (msgcnts); + SUPERLU_FREE (msgcntsU); + + for (i = 0; i <= num_look_aheads; i++) { + SUPERLU_FREE (send_reqs_u[i]); + SUPERLU_FREE (recv_reqs_u[i]); + SUPERLU_FREE (send_reqs[i]); + SUPERLU_FREE (recv_reqs[i]); + } + + SUPERLU_FREE (recv_reqs_u); + SUPERLU_FREE (send_reqs_u); + SUPERLU_FREE (recv_reqs); + SUPERLU_FREE (send_reqs); + +#ifdef GPU_ACC + checkGPU (gpuFreeHost (bigV)); + checkGPU (gpuFreeHost (bigU)); + gpuFree( (void*)dA ); /* Sherry added */ + gpuFree( (void*)dB ); + gpuFree( (void*)dC ); + SUPERLU_FREE( handle ); + SUPERLU_FREE( streams ); + SUPERLU_FREE( stream_end_col ); +#else +// #ifdef __INTEL_COMPILER +// _mm_free (bigU); +// _mm_free (bigV); +// #else + SUPERLU_FREE (bigV); + SUPERLU_FREE (bigU); +// #endif + /* Decrement freed memory from memory stat. */ + log_memory(-(bigv_size + bigu_size) * dword, stat); +#endif + + SUPERLU_FREE (Llu->ujrow); + // SUPERLU_FREE (tempv2d);/* Sherry */ + SUPERLU_FREE (indirect); + SUPERLU_FREE (indirect2); /* Sherry added */ + + ldt = sp_ienv_dist(3); + log_memory( -(3 * ldt *ldt * dword + 2 * ldt * num_threads * iword), stat ); + + /* Sherry added */ + SUPERLU_FREE(omp_loop_time); + SUPERLU_FREE(full_u_cols); + SUPERLU_FREE(blk_ldu); +#if ( PRNTlevel>=1 ) + log_memory(-2 * ncb * dword, stat); +#endif + + SUPERLU_FREE(lookAheadFullRow); + SUPERLU_FREE(lookAheadStRow); + SUPERLU_FREE(lookAhead_lptr); + SUPERLU_FREE(lookAhead_ib); + + SUPERLU_FREE(RemainStRow); + SUPERLU_FREE(Remain_lptr); + SUPERLU_FREE(Remain_ib); + SUPERLU_FREE(Remain_info); + SUPERLU_FREE(lookAhead_L_buff); + SUPERLU_FREE(Remain_L_buff); + log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads + 1) * dword + + Llu->bufmax[1] * dword), stat ); + + SUPERLU_FREE(Ublock_info); + /*SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ + + +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + /* Prepare error message - find the smallesr index i that 
U(i,i)==0 */ + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if ( iam==0 ) { + printf ("\tPSGSTRF comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + printf("\t\tcomm time on task 0: %8.2lf\n" + "\t\t\tcomm down DIAG block %8.2lf\n" + "\t\t\tcomm right L panel %8.2lf\n" + "\t\t\tcomm down U panel %8.2lf\n", + stat->utime[COMM], stat->utime[COMM_DIAG], + stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]); + //#include + //int Digs = DECIMAL_DIG; + printf("gemm_count %d\n", gemm_count); + for (i = 0; i < gemm_count; ++i) + fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n, + gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]); + + fclose(fgemm); + } + SUPERLU_FREE(gemm_stats); + SUPERLU_FREE(prof_sendR); + } +#endif + +#if ( PRNTlevel==3 ) + MPI_Allreduce (&zero_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. # msg of zero size\t%d\n", iinfo); + MPI_Allreduce (&total_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. 
# total msg\t%d\n", iinfo); +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks (iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks (iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + +#if ( DEBUGlevel>=3 ) + printf ("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgstrf()"); +#endif + + return 0; +} /* PSGSTRF */ + diff --git a/SRC/psgstrf2.c b/SRC/psgstrf2.c new file mode 100644 index 00000000..b07701b2 --- /dev/null +++ b/SRC/psgstrf2.c @@ -0,0 +1,947 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs panel LU factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified:
+ *   September 30, 2017
+ *   May 10, 2019 version 7.0.0
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +//#include "cblas.h" + +/***************************************************************************** + * The following psgstrf2_trsm is in version 6 and earlier. + *****************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +/* This pdgstrf2 is based on TRSM function */ +void +psgstrf2_trsm + (superlu_dist_options_t * options, int_t k0, int_t k, double thresh, + Glu_persist_t * Glu_persist, gridinfo_t * grid, sLocalLU_t * Llu, + MPI_Request * U_diag_blk_send_req, int tag_ub, + SuperLUStat_t * stat, int *info) +{ + /* printf("entering psgstrf2 %d \n", grid->iam); */ + int cols_left, iam, l, pkk, pr; + int incx = 1, incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ + int luptr; + int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t *xsup = Glu_persist->xsup; + float *lusup, temp; + float *ujrow, *ublk_ptr; /* pointer to the U block */ + float alpha = -1, zero = 0.0; + int_t Pr; + MPI_Status status; + MPI_Comm comm = (grid->cscp).comm; + double t1, t2; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; +#ifdef PI_DEBUG + printf ("rank %d Iter %d k=%d \t strsm nsuper %d \n", + iam, k0, k, nsupr); +#endif + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. 
*/ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if ( U_diag_blk_send_req && + U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) { + /* There are pending sends - wait for all Isend to complete */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != myrow) { + MPI_Wait (U_diag_blk_send_req + pr, &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + /* flag no more outstanding send request. */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; + } + + if (iam == pkk) { /* diagonal process */ + /* ++++ First step compute diagonal block ++++++++++ */ + for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ + /* Diagonal pivot */ + i = luptr; + /* May replace zero pivot. */ + if (options->ReplaceTinyPivot == YES ) { + if (fabs (lusup[i]) < thresh) { /* Diagonal */ + +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif + /* Keep the new diagonal entry with the same sign. */ + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif + ++(stat->TinyPivots); + } + } + +#if 0 + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ +#endif + + /* storing U in full form */ + int st; + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { + st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + } + + if ( ujrow[0] == zero ) { /* Test for singularity. */ + *info = j + jfst + 1; + } else { /* Scale the j-th column within diag. block. */ + temp = 1.0 / ujrow[0]; + for (i = luptr + 1; i < luptr - j + nsupc; ++i) + lusup[i] *= temp; + stat->ops[FACT] += nsupc - j - 1; + } + + /* Rank-1 update of the trailing submatrix within diag. block. 
*/ + if (--cols_left) { + /* l = nsupr - j - 1; */ + l = nsupc - j - 1; /* Piyush */ + sger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx, + &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], + &nsupr); + stat->ops[FACT] += 2 * l * cols_left; + } + + /* ujrow = ublk_ptr + u_diag_cnt; */ + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... first loop */ + + /* ++++ Second step compute off-diagonal block with communication ++*/ + + ublk_ptr = ujrow = Llu->ujrow; + + if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */ + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != krow) { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_FLOAT, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + + /* flag outstanding Isend */ + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */ + } + + /* pragma below would be changed by an MKL call */ + + l = nsupr - nsupc; + // n = nsupc; + float alpha = 1.0; +#ifdef PI_DEBUG + printf ("calling strsm\n"); + printf ("strsm diagonal param 11: %d \n", nsupr); +#endif + +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr, + 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l; + } else { /* non-diagonal process */ + /* ================================================================== * + * Receive the diagonal block of U for panel factorization of L(:,k). 
* + * Note: we block for panel factorization of L(:,k), but panel * + * factorization of U(:,k) do not block * + * ================================================================== */ + + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0)); +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_FLOAT, krow, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, &status); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + if (nsupr > 0) { + float alpha = 1.0; + +#ifdef PI_DEBUG + printf ("strsm non diagonal param 11: %d \n", nsupr); + if (!lusup) + printf (" Rank :%d \t Empty block column occurred :\n", iam); +#endif +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr; + } + + } /* end if pkk ... */ + + /* printf("exiting psgstrf2 %d \n", grid->iam); */ + +} /* PSGSTRF2_trsm */ + + + +/***************************************************************************** + * The following functions are for the new pdgstrf2_strsm in the 3D code. + *****************************************************************************/ +static +int_t LpanelUpdate(int off0, int nsupc, float* ublk_ptr, int ld_ujrow, + float* lusup, int nsupr, SCT_t* SCT) +{ + int_t l = nsupr - off0; + float alpha = 1.0; + double t1 = SuperLU_timer_(); + +#define GT 32 +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < CEILING(l, GT); ++i) + { + int_t off = i * GT; + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_strsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + + } /* for i = ... 
*/ + + t1 = SuperLU_timer_() - t1; + + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; + SCT->trf2_time += t1; + SCT->L_PanelUpdate_tl += t1; + return 0; +} + +#pragma GCC push_options +#pragma GCC optimize ("O0") +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
+void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
+                   float *BlockUFactor, /*factored U is overwritten here*/
+                   Glu_persist_t *Glu_persist, gridinfo_t *grid, sLocalLU_t *Llu,
+                   SuperLUStat_t *stat, int *info, SCT_t* SCT)
+{
+    //double t1 = SuperLU_timer_();
+    int_t *xsup = Glu_persist->xsup;
+    float alpha = -1, zero = 0.0;  /* alpha = -1 for the rank-1 update; zero for the singularity test */
+
+    // printf("Entering Local_Sgstrf2 %d \n", k);
+    /* Initialization. */
+    int_t lk = LBj (k, grid);          /* Local block number */
+    int_t jfst = FstBlockC (k);
+    int_t jlst = FstBlockC (k + 1);
+    float *lusup = Llu->Lnzval_bc_ptr[lk];
+    int nsupc = SuperSize (k);
+    int nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+    float *ublk_ptr = BlockUFactor;
+    float *ujrow = BlockUFactor;
+    int_t luptr = 0;                  /* Point to the diagonal entry of column j in lusup[]. */
+    int cols_left = nsupc;          /* columns of the supernode remaining to factor */
+    int_t u_diag_cnt = 0;
+    int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
+    int incx = 1;
+    int incy = ld_ujrow;
+
+    for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
+    {
+        /* Diagonal pivot */
+        int_t i = luptr;
+        /* Optionally replace a tiny pivot (|pivot| < thresh).  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
+        {
+            if (fabs (lusup[i]) < thresh) {  /* Diagonal */
+
+#if ( PRNTlevel>=2 )
+                    printf ("(%d) .. col %d, tiny pivot %e  ",
+                            iam, jfst + j, lusup[i]);
+#endif /* NOTE(review): 'iam' is not declared in this function; this PRNTlevel>=2 branch may not compile -- confirm */
+                /* Keep the new diagonal entry with the same sign. */
+                if (lusup[i] < 0) lusup[i] = -thresh;
+                else lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+                    printf ("replaced by %e\n", lusup[i]);
+#endif
+                ++(stat->TinyPivots);
+            }
+        }
+
+        for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+        {
+            int_t st = j * ld_ujrow + j;
+            ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
+        }
+
+        if (ujrow[0] == zero)   /* Test for singularity. */
+        {
+            *info = j + jfst + 1;
+        }
+        else                /* Scale the j-th column. */
+        {
+            float temp;
+            temp = 1.0 / ujrow[0];
+            for (int_t i = luptr + 1; i < luptr - j + nsupc; ++i)  /* note: this 'i' shadows the outer 'i' */
+                lusup[i] *= temp;
+            stat->ops[FACT] += nsupc - j - 1;
+        }
+
+        /* Rank-1 update of the trailing submatrix. */
+        if (--cols_left)
+        {
+            /*following must be int*/
+            int l = nsupc - j - 1;
+
+	    /* Rank-1 update: A := alpha * x * y^T + A on the trailing block */
+            superlu_sger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
+            stat->ops[FACT] += 2 * l * cols_left;
+        }
+
+        ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
+        luptr += nsupr + 1;           /* move to next column */
+
+    }                       /* for column j ...  first loop */
+
+
+    //int_t thread_id = omp_get_thread_num();
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Sgstrf2 */
+
+#pragma GCC pop_options
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * nsupers (input) int_t (global)
+ *         Number of supernodes.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ * SCT    (output) SCT_t*
+ *        Additional statistics used in the 3D algorithm.
+ *
+ * 
+ */
+void psgstrf2_xtrsm
+(superlu_dist_options_t *options, int_t nsupers,
+ int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist,
+ gridinfo_t *grid, sLocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
+ int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT)
+{
+    /* Panel factorization of block column k.  The diagonal process factors
+     * the diagonal block, Isends it down its process column, and updates its
+     * own L panel; the other column processes receive the diagonal block and
+     * update their L panels with it. */
+    int cols_left, iam, pkk;
+    int incy = 1;
+
+    int nsupr;                /* number of rows in the block (LDA) */
+    int luptr;
+    int_t myrow, krow, j, jfst, jlst, u_diag_cnt;
+    int_t nsupc;              /* number of columns in the block */
+    int_t *xsup = Glu_persist->xsup;  /* also read implicitly by the
+                                         SuperSize/FstBlockC macros below */
+    float *lusup;
+    float *ujrow, *ublk_ptr; /* pointer to the U block */
+    int_t Pr;
+
+    /* Quick return. */
+    *info = 0;
+
+    /* Initialization. */
+    iam = grid->iam;
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    krow = PROW (k, grid);                              /* row owning block k */
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);  /* diagonal process of k */
+    j = LBj (k, grid);       /* Local block number */
+    jfst = FstBlockC (k);
+    jlst = FstBlockC (k + 1);
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize (k);
+    if (Llu->Lrowind_bc_ptr[j])
+        nsupr = Llu->Lrowind_bc_ptr[j][1];
+    else
+        nsupr = 0;
+    ublk_ptr = ujrow = Llu->ujrow;
+
+    luptr = 0;               /* Point to the diagonal entries. */
+    cols_left = nsupc;       /* supernode size */
+    int ld_ujrow = nsupc;    /* leading dimension of ujrow */
+    u_diag_cnt = 0;
+    incy = ld_ujrow;
+
+    if (U_diag_blk_send_req && U_diag_blk_send_req[myrow])
+    {
+        /* There are pending sends - wait for all Isend to complete */
+        Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT);
+    }
+
+    if (iam == pkk)          /* diagonal process */
+    {
+        /*factorize the diagonal block*/
+        Local_Sgstrf2(options, k, thresh, Llu->ujrow, Glu_persist,
+                      grid, Llu, stat, info, SCT);
+        ublk_ptr = ujrow = Llu->ujrow;
+
+        if (U_diag_blk_send_req && iam == pkk)   /* Send the U block */
+        {
+            sISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req,
+                              grid, tag_ub);
+            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */
+        }
+
+        /* Triangular solve on the L panel below the diagonal block. */
+        LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+    }
+    else                     /* non-diagonal process */
+    {
+        /* ================================================ *
+         * Receive the diagonal block of U                  *
+         * for panel factorization of L(:,k)                *
+         * note: we block for panel factorization of L(:,k) *
+         * but panel factorization of U(:,k) don't          *
+         * ================================================ */
+
+        sRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub);
+
+        if (nsupr > 0)
+        {
+            LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+        }
+    } /* end if pkk ... */
+
+} /* psgstrf2_xtrsm */
+
+/*****************************************************************************
+ * The following functions are for the new psgstrs2_omp in the 3D code.
+ *****************************************************************************/ + +/* PSGSTRS2 helping kernels*/ + +int_t sTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + float* uval, float *tempv) +{ + double zero = 0.0; + int_t ncols = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + int_t segsize = klst - usub[jj]; + if ( segsize ) + { + int_t lead_zero = ldu - segsize; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; + tempv += lead_zero; + for (int_t i = 0; i < segsize; ++i) + tempv[i] = uval[rukp + i]; + rukp += segsize; + tempv += segsize; + ncols++; + } + } + return ncols; +} + +int_t sTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, float* uval, float *tempv) +{ + for (int_t jj = 0; jj < nsupc; ++jj) + { + int_t segsize = klst - usub[iukp + jj]; + if (segsize) + { + int_t lead_zero = ldu - segsize; + tempv += lead_zero; + for (int i = 0; i < segsize; ++i) + { + uval[rukp + i] = tempv[i]; + } + tempv += segsize; + rukp += segsize; + } + } /*for jj=0:nsupc */ + return 0; +} + +int_t sTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, float *uval, float *tempv, + int_t knsupc, int nsupr, float *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +{ + float alpha = 1.0; + int_t *xsup = Glu_persist->xsup; + // int_t iukp = Ublock_info.iukp; + // int_t rukp = Ublock_info.rukp; + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + iukp += UB_DESCRIPTOR; + + // printf("klst inside task%d\n", ); + /*find ldu */ + int ldu = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; + } + + /*pack U block into a dense Block*/ + int ncols = sTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + uval, tempv); + + /*now call strsm on packed dense block*/ + int_t luptr = (knsupc - ldu) * (nsupr + 1); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + + superlu_strsm("L", 
"L", "N", "U", ldu, ncols, alpha, + &lusup[luptr], nsupr, tempv, ldu); + + /*now scatter the output into sparse U block*/ + sTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + return 0; +} + +/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ + +#if 1 + +/***************************************************************************** + * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. + *****************************************************************************/ +void psgstrs2_omp +(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, + sLocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) +{ +#ifdef PI_DEBUG + printf("====Entering psgstrs2==== \n"); +#endif + int iam, pkk; + int incx = 1; + int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int segsize; + int nsupc; /* number of columns in the block */ + int_t luptr, iukp, rukp; + int_t b, gb, j, klst, knsupc, lk, nb; + int_t *xsup = Glu_persist->xsup; + int_t *usub; + float *lusup, *uval; + +#if 0 + //#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* Quick return. */ + lk = LBi (k, grid); /* Local block number */ + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. 
*/ + iam = grid->iam; + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */ + //int gb_col_cycle; /* cycle through block columns */ + klst = FstBlockC (k + 1); + knsupc = SuperSize (k); + usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + uval = Llu->Unzval_br_ptr[lk]; + if (iam == pkk) { + lk = LBj (k, grid); + nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ + lusup = Llu->Lnzval_bc_ptr[lk]; + } else { + nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */ + lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)]; + } + + /////////////////////new-test////////////////////////// + /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */ + + /* Master thread: set up pointers to each block in the row */ + nb = usub[0]; + iukp = BR_HEADER; + rukp = 0; + + /* Sherry: can use the existing Ublock_info[] array, call + Trs2_InitUblock_info(); */ +#undef USE_Ublock_info +#ifdef USE_Ublock_info /** 4/19/2019 **/ + /* Loop through all the row blocks. 
to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); +#else + int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int)); + int* blocks_value_pointers = blocks_index_pointers + nb; + int* nsupc_temp = blocks_value_pointers + nb; + for (b = 0; b < nb; b++) { /* set up pointers to each block */ + blocks_index_pointers[b] = iukp + UB_DESCRIPTOR; + blocks_value_pointers[b] = rukp; + gb = usub[iukp]; + rukp += usub[iukp+1]; + nsupc = SuperSize( gb ); + nsupc_temp[b] = nsupc; + iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ + } +#endif + + // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c + // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for +#ifdef _OPENMP +#pragma omp parallel for schedule(static) default(shared) \ + private(b,j,iukp,rukp,segsize) +#endif + /* Loop through all the blocks in the row. */ + for (b = 0; b < nb; ++b) { +#ifdef USE_Ublock_info + iukp = Ublock_info[b].iukp; + rukp = Ublock_info[b].rukp; +#else + iukp = blocks_index_pointers[b]; + rukp = blocks_value_pointers[b]; +#endif + + /* Loop through all the segments in the block. */ +#ifdef USE_Ublock_info + gb = usub[iukp]; + nsupc = SuperSize( gb ); + iukp += UB_DESCRIPTOR; + for (j = 0; j < nsupc; j++) { +#else + for (j = 0; j < nsupc_temp[b]; j++) { +#endif + segsize = klst - usub[iukp++]; + if (segsize) { +#ifdef _OPENMP +#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30) +#endif + { /* Nonzero segment. 
*/ + int_t luptr = (knsupc - segsize) * (nsupr + 1); + //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr); + +#if defined (USE_VENDOR_BLAS) + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx, 1, 1, 1); +#else + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx); +#endif + } /* end task */ + rukp += segsize; +#ifndef USE_Ublock_info + stat->ops[FACT] += segsize * (segsize + 1); +#endif + } /* end if segsize > 0 */ + } /* end for j in parallel ... */ +#ifdef _OPENMP +/* #pragma omp taskwait */ +#endif + } /* end for b ... */ + +#ifndef USE_Ublock_info + /* Deallocate memory */ + SUPERLU_FREE(blocks_index_pointers); +#endif + +#if 0 + //#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + +} /* psgstrs2_omp */ + +#else /*==== new version from Piyush ====*/ + +void psgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, + float *Lval_buf, Glu_persist_t *Glu_persist, + gridinfo_t *grid, sLocalLU_t *Llu, SuperLUStat_t *stat, + Ublock_info_t *Ublock_info, float *bigV, int_t ldt, SCT_t *SCT) +{ + double t1 = SuperLU_timer_(); + int_t *xsup = Glu_persist->xsup; + /* Quick return. */ + int_t lk = LBi (k, grid); /* Local block number */ + + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + int_t knsupc = SuperSize (k); + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + float *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + float *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. 
*/ +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic,2) +#endif + for (int_t b = 0; b < nb; ++b) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + float *tempv = bigV + thread_id * ldt * ldt; + sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist); + } /* for b ... */ + + SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1); +} /* pdgstrs2_omp new version from Piyush */ + +#endif /* there are 2 versions of psgstrs2_omp */ diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c new file mode 100644 index 00000000..df3bd986 --- /dev/null +++ b/SRC/psgstrf3d.c @@ -0,0 +1,392 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "slustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PSGSTRF3D performs the LU factorization in parallel using a 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */
+int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
+                trf3Dpartition_t*  trf3Dpartition, SCT_t *SCT,
+                sLUstruct_t *LUstruct, gridinfo3d_t * grid3d,
+                SuperLUStat_t *stat, int *info)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    // problem specific constants
+    int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */
+    // double s_eps = slamch_ ("Epsilon");  -Sherry
+    double s_eps = smach_dist("Epsilon");
+    double thresh = s_eps * anorm;  /* pivot-replacement threshold */
+
+    /* Test the input parameters. */
+    *info = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter psgstrf3d()");
+#endif
+
+    // Initialize stat
+    stat->ops[FACT] = 0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer = 0.0;
+    stat->gpu_buffer = 0.0;
+    //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
+
+    //getting Nsupers
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+
+    // Grid related Variables
+    int_t iam = grid->iam; // in 2D grid
+    int num_threads = getNumThreads(grid3d->iam);
+
+    factStat_t factStat;
+    initFactStat(nsupers, &factStat);
+
+#if 0 // sherry: not used
+    sdiagFactBufs_t dFBuf;
+    sinitDiagFactBufs(ldt, &dFBuf);
+
+    commRequests_t comReqs;
+    initCommRequests(&comReqs, grid);
+
+    msgs_t msgs;
+    initMsgs(&msgs);
+#endif
+
+    SCT->tStartup = SuperLU_timer_();
+    packLUInfo_t packLUInfo;
+    initPackLUInfo(nsupers, &packLUInfo);
+
+    sscuBufs_t scuBufs;
+    sinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid);
+
+    factNodelists_t fNlists;
+    initFactNodelists( ldt, num_threads, nsupers, &fNlists);
+
+    // tag_ub initialization
+    int tag_ub = set_tag_ub();
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;  /* #levels in the Z-reduction tree */
+
+#if ( PRNTlevel>=1 )
+    if (grid3d->iam == 0) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
+#endif
+
+    // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d);
+    gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
+    int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno;
+    int_t* myNodeCount = trf3Dpartition->myNodeCount;
+    int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    int_t** treePerm = trf3Dpartition->treePerm ;
+    sLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
+
+    /* Initializing factorization specific buffers */
+
+    int_t numLA = getNumLookAhead(options);
+    sLUValSubBuf_t** LUvsbs = sLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
+    msgs_t**msgss = initMsgsArr(numLA);
+    int_t mxLeafNode = 0;  /* max #leaves over my subtrees, sizes dFBufs/comReqss */
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl) {
+        if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode )
+            mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
+    }
+    sdiagFactBufs_t** dFBufs = sinitDiagFactBufsArr(mxLeafNode, ldt, grid);
+    commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid);
+
+    /* Setting up GPU related data structures */
+
+    int_t first_l_block_acc = 0;
+    int_t first_u_block_acc = 0;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t mrb = (nsupers + Pr - 1) / Pr;
+    int_t mcb = (nsupers + Pc - 1) / Pc;
+    HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
+
+    sInit_HyP(HyP, Llu, mcb, mrb);
+    HyP->first_l_block_acc = first_l_block_acc;
+    HyP->first_u_block_acc = first_u_block_acc;
+
+    int superlu_acc_offload = HyP->superlu_acc_offload;
+
+    //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    int_t bigu_size = getBigUSize(nsupers, grid,
+                                  LUstruct->Llu->Lrowind_bc_ptr);
+    HyP->bigu_size = bigu_size;
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
+    HyP->buffer_size = buffer_size;
+    HyP->nsupers = nsupers;
+
+#ifdef GPU_ACC
+
+    /*Now initialize the GPU data structure*/
+    sLUstruct_gpu_t *A_gpu, *dA_gpu;
+
+    d2Hreduce_t d2HredObj;
+    d2Hreduce_t* d2Hred = &d2HredObj;
+    ssluGPU_t sluGPUobj;
+    ssluGPU_t *sluGPU = &sluGPUobj;
+    sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm);
+    if (superlu_acc_offload)
+    {
+#if 0 /* Sherry: For GPU code on titan, we do not need performance
+         lookup tables since due to difference in CPU-GPU performance,
+         it didn't make much sense to do any Schur-complement update
+         on CPU, except for the lookahead-update on CPU. Same should
+         hold for summit as well. (from Piyush) */
+
+        /*Initialize the lookup tables */
+        LookUpTableInit(iam);
+        acc_async_cost = get_acc_async_cost();
+#ifdef GPU_DEBUG
+        if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost);
+#endif
+#endif
+
+        //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+        int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                              LUstruct->etree,
+                                              LUstruct->Glu_persist,
+                                              LUstruct->Llu->Lrowind_bc_ptr,
+                                              LUstruct->Llu->Ufstnz_br_ptr,
+                                              grid);
+
+        /* Initialize GPU data structures */
+        sinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno,
+                        n, buffer_size, bigu_size, ldt);
+
+        HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu;
+        HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu;
+        HyP->nGPUStreams = sluGPU->nGPUStreams;
+    }
+
+#endif // end GPU_ACC
+
+    /*==== starting main factorization loop =====*/
+    MPI_Barrier( grid3d->comm);
+    SCT->tStartup = SuperLU_timer_() - SCT->tStartup;
+    // int_t myGrid = grid3d->zscp.Iam;
+
+#ifdef ITAC_PROF
+    VT_traceon();
+#endif
+#ifdef MAP_PROFILE
+    allinea_start_sampling();
+#endif
+    SCT->pdgstrfTimer = SuperLU_timer_();
+
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
+    {
+        /* if I participate in this level */
+        if (!myZeroTrIdxs[ilvl])
+        {
+            //int_t tree = myTreeIdxs[ilvl];
+
+            sForest_t* sforest = sForests[myTreeIdxs[ilvl]];
+
+            /* main loop over all the supernodes */
+            if (sforest) /* 2D factorization at individual subtree */
+            {
+                double tilvl = SuperLU_timer_();
+#ifdef GPU_ACC
+                ssparseTreeFactor_ASYNC_GPU(
+                    sforest,
+                    comReqss, &scuBufs, &packLUInfo,
+                    msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                    &gEtreeInfo, options, iperm_c_supno, ldt,
+                    sluGPU, d2Hred, HyP, LUstruct, grid3d, stat,
+                    thresh, SCT, tag_ub, info);
+#else
+                ssparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo,
+                                        msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                                        &gEtreeInfo, options, iperm_c_supno, ldt,
+                                        HyP, LUstruct, grid3d, stat,
+                                        thresh, SCT, tag_ub, info );
+#endif
+
+                /*now reduce the updates*/
+                SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
+                sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
+            }
+
+            if (ilvl < maxLvl - 1) /*then reduce before factorization*/
+            {
+#ifdef GPU_ACC
+                sreduceAllAncestors3d_GPU(
+                    ilvl, myNodeCount, treePerm, LUvsb,
+                    LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP,
+                    SCT );
+#else
+
+                sreduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+                                      LUvsb, LUstruct, grid3d, SCT );
+#endif
+
+            }
+        } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/
+
+        SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
+            : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
+    } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */
+
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    if (superlu_acc_offload) {
+        sfree_LUstruct_gpu (sluGPU->A_gpu);
+    }
+#endif
+
+    /* Prepare error message - find the smallest index i that U(i,i)==0 */
+    int iinfo;
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+    //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
+
+    SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
+
+#ifdef ITAC_PROF
+    VT_traceoff();
+#endif
+
+#ifdef MAP_PROFILE
+    allinea_stop_sampling();
+#endif
+
+    reduceStat(FACT, stat, grid3d);
+
+    // sherry added
+    /* Deallocate factorization specific buffers */
+    freePackLUInfo(&packLUInfo);
+    sfreeScuBufs(&scuBufs);
+    freeFactStat(&factStat);
+    freeFactNodelists(&fNlists);
+    freeMsgsArr(numLA, msgss);
+    freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss);
+    sLluBufFreeArr(numLA, LUvsbs);
+    sfreeDiagFactBufsArr(mxLeafNode, dFBufs);
+    Free_HyP(HyP);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit psgstrf3d()");
+#endif
+    return 0;
+
+} /* psgstrf3d */
diff --git a/SRC/psgstrs.c b/SRC/psgstrs.c
new file mode 100644
index 00000000..3658bd6c
--- /dev/null
+++ b/SRC/psgstrs.c
@@ -0,0 +1,2400 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Solves a system of distributed linear equations A*X = B with a
+ * general N-by-N matrix A using the LU factors computed previously.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
+ * 
+ */ +#include +#include "superlu_sdefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + +/* + * Sketch of the algorithm for L-solve: + * ======================= + * + * Self-scheduling loop: + * + * while ( not finished ) { .. use message counter to control + * + * reveive a message; + * + * if ( message is Xk ) { + * perform local block modifications into lsum[]; + * lsum[i] -= L_i,k * X[k] + * if all local updates done, Isend lsum[] to diagonal process; + * + * } else if ( message is LSUM ) { .. this must be a diagonal process + * accumulate LSUM; + * if ( all LSUM are received ) { + * perform triangular solve for Xi; + * Isend Xi down to the current process column; + * perform local block modifications into lsum[]; + * } + * } + * } + * + * + * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array) + * ======================= + * + * lsum[] array (local) + * + lsum has "nrhs" columns, row-wise is partitioned by supernodes + * + stored by row blocks, column wise storage within a row block + * + prepend a header recording the global block number. + * + * lsum[] ilsum[nsupers + 1] + * + * ----- + * | | | <- header of size 2 --- + * --------- <--------------------| | + * | | | | | --- + * | | | | | |-----------| | + * | | | | | | --- + * --------- | |-------| | + * | | | <- header | | --- + * --------- <--------| | |----| | + * | | | | | | | --- + * | | | | | | | + * | | | | | | | + * --------- | | + * | | | <- header | | + * --------- <------------| | + * | | | | | | + * | | | | | | + * | | | | | | + * --------- <---------------| + */ + +/*#define ISEND_IRECV*/ + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ *
+ * B      (input) float*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) float*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ * 
+ */ + +int_t +psReDistribute_B_to_X(float *B, int_t m_loc, int nrhs, int_t ldb, + int_t fst_row, int_t *ilsum, float *x, + sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, + gridinfo_t *grid, sSOLVEstruct_t *SOLVEstruct) +{ + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *perm_r, *perm_c; /* row and column permutation vectors */ + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + int_t *xsup, *supno; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; + int p, procs; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp, pps, ppr; + double t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psReDistribute_B_to_X()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + SendCnt = gstrs_comm->B_to_X_SendCnt; + SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; + RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; + sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; + rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + /* ------------------------------------------------------------ + NOW COMMUNICATE THE ACTUAL DATA. 
+ ------------------------------------------------------------*/ + + if(procs==1){ // faster memory copy when procs=1 + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { + // t = SuperLU_timer_(); +#ifdef _OPENMP +#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied +#endif + for (i = 0; i < m_loc; ++i) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */ + + k = BlockNum( irow ); + knsupc = SuperSize( k ); + l = X_BLK( k ); + + x[l - XK_H] = k; /* Block number prepended in the header. */ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = B[i + j*ldb]; + } + } + } + } + }else{ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)* (size_t)nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls[p] * nrhs; + } + + /* Copy the row indices and values to the send buffer. 
*/ + // t = SuperLU_timer_(); + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + k = ptr_to_ibuf[p]; + send_ibuf[k] = irow; + ++ptr_to_ibuf[p]; + + k = ptr_to_dbuf[p]; + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + send_dbuf[k++] = B[i + j*ldb]; + } + ptr_to_dbuf[p] += nrhs; + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to send buffer time\t%8.4f\n", t); + +#if 0 + #if 1 + /* Communicate the (permuted) row indices. */ + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + /* Communicate the numerical values. */ + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + #else + /* Communicate the (permuted) row indices. */ + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); + /* Communicate the numerical values. 
*/ + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm, &req_d); + MPI_Wait(&req_i,&status); + MPI_Wait(&req_d,&status); + #endif +#endif + MPI_Barrier( grid->comm ); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + + if(SendCnt[pps]>0){ + MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt[ppr]>0){ + MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + if(SendCnt_nrhs[pps]>0){ + MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt_nrhs[ppr]>0){ + MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + /* ------------------------------------------------------------ + Copy buffer into X on the diagonal processes. + ------------------------------------------------------------*/ + + // t = SuperLU_timer_(); + ii = 0; + for (p = 0; p < procs; ++p) { + jj = rdispls_nrhs[p]; + for (i = 0; i < RecvCnt[p]; ++i) { + /* Only the diagonal processes do this; the off-diagonal processes + have 0 RecvCnt. */ + irow = recv_ibuf[ii]; /* The permuted row index. 
*/ + k = BlockNum( irow ); + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number. */ + l = X_BLK( lk ); + x[l - XK_H] = k; /* Block number prepended in the header. */ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = recv_dbuf[jj++]; + } + ++ii; + } + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to x time\t%8.4f\n", t); + + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); + } + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psReDistribute_B_to_X()"); +#endif + return 0; +} /* psReDistribute_B_to_X */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ * </pre>
+ */
+
+int_t
+psReDistribute_X_to_B(int_t n, float *B, int_t m_loc, int_t ldb, int_t fst_row,
+                      int_t nrhs, float *x, int_t *ilsum,
+                      sScalePermstruct_t *ScalePermstruct,
+                      Glu_persist_t *Glu_persist, gridinfo_t *grid,
+                      sSOLVEstruct_t *SOLVEstruct)
+{
+    int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
+    int_t *xsup, *supno;
+    int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs;
+    int *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t *send_ibuf, *recv_ibuf;
+    float *send_dbuf, *recv_dbuf;
+    int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+    int iam, p, q, pkk, procs;
+    int_t num_diag_procs, *diag_procs;
+    MPI_Request req_i, req_d, *req_send, *req_recv;
+    MPI_Status status, *status_send, *status_recv;
+    int Nreq_recv, Nreq_send, pp,pps,ppr;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter psReDistribute_X_to_B()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+
+    /* The eight count/displacement arrays (each of length procs) are stored
+       back-to-back in gstrs_comm->X_to_B_SendCnt; per the doc comment above,
+       they were set up by psgstrs_init(). */
+    SendCnt = gstrs_comm->X_to_B_SendCnt;
+    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs;
+    RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
+    sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
+    rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
+    ptr_to_ibuf = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf = gstrs_comm->ptr_to_dbuf;
+
+
+    if(procs==1){ //faster memory copy when procs=1
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared)
+#endif
+	{
+#ifdef _OPENMP
+#pragma omp master
+#endif
+	    {
+		// t = SuperLU_timer_();
+#ifdef _OPENMP
+#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
+#endif
+		/* Single-process fast path: copy each supernode block of x
+		   straight into B, skipping all buffer packing and MPI. */
+		for (k = 0; k < nsupers; k++) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid ); /* Local block number */
+		    irow = FstBlockC( k );
+		    l = X_BLK( lk );
+		    for (i = 0; i < knsupc; ++i) {
+			RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+			    B[irow-fst_row +i + j*ldb] = x[l + i + j*knsupc];
+			}
+		    }
+		}
+	    }
+	}
+    }else{
+	k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+	l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+	if ( !(send_ibuf = intMalloc_dist(k + l)) )
+	    ABORT("Malloc fails for send_ibuf[].");
+	recv_ibuf = send_ibuf + k;    /* recv area shares the same allocation */
+	if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) )
+	    ABORT("Malloc fails for send_dbuf[].");
+	if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
+	    ABORT("Malloc fails for req_send[].");
+	if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
+	    ABORT("Malloc fails for req_recv[].");
+	if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
+	    ABORT("Malloc fails for status_send[].");
+	if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
+	    ABORT("Malloc fails for status_recv[].");
+	recv_dbuf = send_dbuf + k * nrhs;
+	for (p = 0; p < procs; ++p) {
+	    ptr_to_ibuf[p] = sdispls[p];
+	    ptr_to_dbuf[p] = sdispls_nrhs[p];
+	}
+	num_diag_procs = SOLVEstruct->num_diag_procs;
+	diag_procs = SOLVEstruct->diag_procs;
+	/* Pack the (permuted) row indices and the x values into the send
+	   buffers; the destination process of row ii is row_to_proc[ii]. */
+	for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */
+	    pkk = diag_procs[p];
+	    if ( iam == pkk ) {
+		for (k = p; k < nsupers; k += num_diag_procs) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid ); /* Local block number */
+		    irow = FstBlockC( k );
+		    l = X_BLK( lk );
+		    for (i = 0; i < knsupc; ++i) {
+ #if 0
+			ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
+ #else
+			ii = irow;
+ #endif
+			q = row_to_proc[ii];
+			jj = ptr_to_ibuf[q];
+			send_ibuf[jj] = ii;
+			jj = ptr_to_dbuf[q];
+			RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+			    send_dbuf[jj++] = x[l + i + j*knsupc];
+			}
+			++ptr_to_ibuf[q];
+			ptr_to_dbuf[q] += nrhs;
+			++irow;
+		    }
+		}
+	    }
+	}
+
+	/* ------------------------------------------------------------
+	   COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
+	   ------------------------------------------------------------*/
+#if 0
+	#if 1
+	MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		      recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+	MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,MPI_FLOAT,
+		      recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		      grid->comm);
+	#else
+	MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		       recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i);
+	MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT,
+		       recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		       grid->comm,&req_d);
+	MPI_Wait(&req_i,&status);
+	MPI_Wait(&req_d,&status);
+	#endif
+#endif
+
+	/* Nonblocking point-to-point exchange in a shifted (ring) ordering:
+	   row indices first (tag 0), then values (tag 1).  The collective
+	   MPI_Alltoallv variant is kept above but disabled under "#if 0".
+	   NOTE(review): the two for-loop headers below are garbled in this
+	   diff chunk (text following a '<' was lost in extraction); upstream
+	   presumably reads "for (pp=0;pp<procs;pp++){ pps = grid->iam+1+pp;"
+	   -- verify against the original source. */
+	MPI_Barrier( grid->comm );
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;ppiam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt[pps]>0){
+		MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt[ppr]>0){
+		MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;ppiam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt_nrhs[pps]>0){
+		MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt_nrhs[ppr]>0){
+		MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+
+	/* ------------------------------------------------------------
+	   COPY THE BUFFER INTO B.
+	   ------------------------------------------------------------*/
+	for (i = 0, k = 0; i < m_loc; ++i) {
+	    irow = recv_ibuf[i];
+	    irow -= fst_row; /* Relative row number */
+	    RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+		B[irow + j*ldb] = recv_dbuf[k++];
+	    }
+	}
+
+	/* recv_ibuf/recv_dbuf live inside send_ibuf/send_dbuf, so freeing
+	   the two base pointers releases everything. */
+	SUPERLU_FREE(send_ibuf);
+	SUPERLU_FREE(send_dbuf);
+	SUPERLU_FREE(req_send);
+	SUPERLU_FREE(req_recv);
+	SUPERLU_FREE(status_send);
+	SUPERLU_FREE(status_recv);
+}
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit psReDistribute_X_to_B()");
+#endif
+    return 0;
+
+} /* psReDistribute_X_to_B */
+
+
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Compute the inverse of the diagonal blocks of the L and U
+ *   triangular matrices.
+ * </pre>
+ */
+void
+psCompute_Diag_Inv(int_t n, sLUstruct_t *LUstruct,gridinfo_t *grid,
+                   SuperLUStat_t *stat, int *info)
+{
+#ifdef SLU_HAVE_LAPACK
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    float *lusup;
+    float *recvbuf, *tempv;
+    float *Linv;/* Inverse of diagonal block */
+    float *Uinv;/* Inverse of diagonal block */
+
+    int_t kcol, krow, mycol, myrow;
+    int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t nb, nlb,nlb_nodiag, nub, nsupers;
+    int_t *xsup, *supno, *lsub, *usub;
+    int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/
+    int Pc, Pr, iam;
+    int knsupc, nsupr;
+    int ldalsum; /* Number of lsum entries locally owned. */
+    int maxrecvsz, p, pi;
+    int_t **Lrowind_bc_ptr;
+    float **Lnzval_bc_ptr;
+    float **Linv_bc_ptr;
+    float **Uinv_bc_ptr;
+    int INFO;
+    double t;
+
+    float one = 1.0;
+    float zero = 0.0;
+
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_();
+#endif
+
+#if ( PRNTlevel>=2 )
+    if ( grid->iam==0 ) {
+	printf("computing inverse of diagonal blocks...\n");
+	fflush(stdout);
+    }
+#endif
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = supno[n-1] + 1;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Linv_bc_ptr = Llu->Linv_bc_ptr;
+    Uinv_bc_ptr = Llu->Uinv_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+    Llu->inv = 1;   /* record that inverted diagonal blocks are available */
+
+    /*---------------------------------------------------
+     * Compute inverse of L(lk,lk).
+     *---------------------------------------------------*/
+
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid ); /* local block number */
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* diagonal process */
+
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		Linv = Linv_bc_ptr[lk];
+		Uinv = Uinv_bc_ptr[lk];
+		nsupr = lsub[1];
+		knsupc = SuperSize( k );
+
+		/* NOTE(review): the text between the "for (j=0 ; j" line
+		   below and the stray "=1 )" fragment is garbled in this
+		   diff chunk -- a span following a '<' was lost in
+		   extraction.  Presumably that span held the loop body that
+		   copies the diagonal block into Linv/Uinv and inverts it
+		   (LAPACK call), plus the closing braces and an
+		   "#if ( PRNTlevel>=1 )" to which the fragment belongs.
+		   Verify against the original source before using this hunk. */
+		for (j=0 ; j=1 )
+	if( grid->iam==0 ) {
+	    t = SuperLU_timer_() - t;
+	    printf(".. L-diag_inv time\t%10.5f\n", t);
+	    fflush(stdout);
+	}
+#endif
+
+    return;
+#endif /* SLU_HAVE_LAPACK */
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PSGSTRS solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by PSGSTRF.
+ * If the equilibration, and row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from PSGSTRF for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *        A may be scaled and permuted into A1, so that
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>
+ */ + +void +psgstrs(int_t n, sLUstruct_t *LUstruct, + sScalePermstruct_t *ScalePermstruct, + gridinfo_t *grid, float *B, + int_t m_loc, int_t fst_row, int_t ldb, int nrhs, + sSOLVEstruct_t *SOLVEstruct, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float beta = 0.0; + float zero = 0.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + /* NOTE: x and lsum are of same size. */ + float *lusup, *dest; + float *recvbuf, *recvbuf_on, *tempv, + *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin; + float *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */ + float *Linv; /* Inverse of diagonal block */ + float *Uinv; /* Inverse of diagonal block */ + int *ipiv; + int_t *leaf_send; + int_t nleaf_send, nleaf_send_tmp; + int_t *root_send; + int_t nroot_send, nroot_send_tmp; + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + /*-- Data structures used for broadcast and reduction trees. --*/ + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + int_t *Urbs1; /* Number of row blocks in each block column of U. */ + int_t *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, kk, lb, ljb, lk, lib, lptr, luptr, gb, nn; + int_t nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i,maxsuper; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr, nprobe; + int nbtree, nrtree, outcount; + int ldalsum; /* Number of lsum entries locally owned. 
*/ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + float **Linv_bc_ptr; + float **Uinv_bc_ptr; + float sum; + MPI_Status status,status_on,statusx,statuslsum; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + SuperLUStat_t **stat_loc; + + double tmax; + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve -- + Count the number of local block products to + be summed into lsum[lk]. */ + int_t fmod_tmp; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t nfrecvx_buf=0; + int_t *frecv; /* Count of lsum[lk] contributions to be received + from processes in this row. + It is only valid on the diagonal processes. */ + int_t frecv_tmp; + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nfrecv = 0; /* Count of total messages to be recv'd. */ + int_t nbrecv = 0; /* Count of total messages to be recv'd. */ + int_t nleaf = 0, nroot = 0; + int_t nleaftmp = 0, nroottmp = 0; + int_t msgsize; + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t bmod_tmp; + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t nbrecvx_buf=0; + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ + int_t flagx,flaglsum,flag; + int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; + int_t TAG; + double t1_sol, t2_sol, t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t gik,iklrow,fnz; + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + int INFO, pad; + int_t tmpresult; + + // #if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + // #endif + + int_t msgcnt[4]; /* Count the size of the message xfer'd in each buffer: + * 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int iword = sizeof (int_t); + int dword = sizeof (float); + int Nwork; + int_t procs = grid->nprow * grid->npcol; + yes_no_t done; + yes_no_t startforward; + int nbrow; + int_t ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc; + int_t lptr1_tmp, idx_i, idx_v,m; + int_t ready; + int thread_id = 0; + yes_no_t empty; + int_t sizelsum,sizertemp,aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int num_thread = 1; + + maxsuper = sp_ienv_dist(3); + +//#ifdef _OPENMP +//#pragma omp threadprivate(thread_id) +//#endif + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + if (omp_get_thread_num () == 0) { + num_thread = omp_get_num_threads (); + } + } +#else + num_thread=1; +#endif + +#if ( PRNTlevel>=1 ) + if( grid->iam==0 ) { + printf("num_thread: %5d\n", num_thread); + fflush(stdout); + } +#endif + + MPI_Barrier( grid->comm ); + t1_sol = SuperLU_timer_(); + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS", grid, -*info); + return; + } + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + stat->utime[SOL_COMM] = 0.0; + stat->utime[SOL_GEMM] = 0.0; + stat->utime[SOL_TRSM] = 0.0; + stat->utime[SOL_TOT] = 0.0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs()"); +#endif + + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(fmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i]; + if ( !(frecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for frecv[]."); + Llu->frecv = frecv; + + if ( !(leaf_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for leaf_send[]."); + nleaf_send=0; + if ( !(root_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for root_send[]."); + nroot_send=0; + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H); + sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d; + +#ifdef _OPENMP + if ( !(lsum = (float*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(float)))) + ABORT("Malloc fails for lsum[]."); +#pragma omp parallel default(shared) private(ii,thread_id) + { + thread_id = omp_get_thread_num(); //mjc + for (ii=0; ii=2 ) + /* Dump the L factor using matlab triple-let format. */ + sDumpLblocks(iam, nsupers, grid, Glu_persist, Llu); +#endif + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + /* Redistribute B into X on the diagonal processes. */ + psReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); + fflush(stdout); + t = SuperLU_timer_(); +#endif + + /* Set up the headers in lsum[]. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + } + } + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. 
+ --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nfrecvx_buf++; + } + BcTree_allocateRequest(LBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( leafsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for leafsups."); + + nrtree = 0; + nleaf=0; + nfrecvmod=0; + + + +if(procs==1){ + for (lk=0;lknprow; /* not sure */ + if(gbnprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); + fflush(stdout); +#endif + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + +#if ( VAMPIR>=1 ) + // VT_initialize(); + VT_traceon(); +#endif + +#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); + fflush(stdout); +#endif + + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id=0; +#endif + { + + if (Llu->inv == 1) { /* Diagonal is inverted. 
*/ + +#ifdef _OPENMP +#pragma omp for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + +#ifdef _OPENMP +#pragma omp simd +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + } + } + } else { /* Diagonal is not inverted. 
*/ +#ifdef _OPENMP +#pragma omp for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if (LBtree_ptr[lk]!=NULL) { + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + } /* end a block */ + } /* end for jj ... */ + } /* end else ... diagonal is not inverted */ + } + } /* end parallel region */ + + jj=0; + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { + +#ifdef _OPENMP +#pragma omp master +#endif + { + +#ifdef _OPENMP +#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. 
*/ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } + } + + +#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + { + for ( nfrecv =0; nfrecv=1 ) + TIC(t1); + // msgcnt[1] = maxrecvsz; +#endif + + recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + // MPI_Irecv(recvbuf0,maxrecvsz,MPI_FLOAT,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req); + // ready=0; + // while(ready==0){ + // MPI_Test(&req,&ready,&status); + // #pragma omp taskyield + // } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + { + + k = *recvbuf0; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + if(status.MPI_TAG==BC_L){ + // --nfrecvx; + nfrecvx_buf++; + { + lk = LBj( k, grid ); /* local block number */ + + if(BcTree_getDestCount(LBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + if(myrow==krow){ + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + }else{ + nb = lsub[0]; + knsupc = SuperSize( k ); + xin = &recvbuf0[XK_H] ; + } + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + + } /* if lsub */ + } + }else if(status.MPI_TAG==RD_L){ + // --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + fmod_tmp=--fmod[lk*aln_i]; + { + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( fmod_tmp==0 ) { + if(RdTree_IsRoot(LRtree_ptr[lk],'s')==YES){ + // ii = X_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=1 ) + TIC(t1); +#endif + if(Llu->inv == 1){ + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinnv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); 
+#endif + } /* end if-else */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + /* + * Perform local block modifications. + */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + } /* if lsub */ + // } + + }else{ /* fmod_tmp != 0 */ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) { + printf(".. L-solve time\t%8.4f\n", t); + fflush(stdout); + } + + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. L-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel==2 ) + { + printf("(%d) .. 
After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + fflush(stdout); + } + MPI_Barrier( grid->comm ); + } + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(leaf_send); + SUPERLU_FREE(leafsups); + SUPERLU_FREE(recvbuf_BC_fwd); + log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat); //account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd + + for (lk=0;lkcomm ); + +#if ( VAMPIR>=1 ) + VT_traceoff(); + VT_finalize(); +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(bmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i]; + if ( !(brecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for brecv[]."); + Llu->brecv = brecv; + + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + + /* Re-initialize lsum to zero. Each block header is already in place. */ + +#ifdef _OPENMP + +#pragma omp parallel default(shared) private(ii) + { + int thread_id = omp_get_thread_num(); + for(ii=0;ii=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. 
row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nbrecvx_buf++; + } + BcTree_allocateRequest(UBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( rootsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for rootsups."); + + nrtree = 0; + nroot=0; + for (lk=0;lknprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nbrecvx %4d, nbrecvmod %4d, nroot %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree); + fflush(stdout); +#endif + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. 
+ */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); + fflush(stdout); +#endif + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp,thread_id) nogroup +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif +#ifdef _OPENMP + thread_id = omp_get_thread_num (); +#else + thread_id = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + // bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + + Uinv = Uinv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(UBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nroot_send_tmp = ++nroot_send; + root_send[(nroot_send_tmp-1)*aln_i] = lk; + + } + } /* for jj ... 
*/ + } /* omp master region */ + } /* omp parallel region */ + + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } +} + + /* + * Compute the internal nodes asychronously by all processes. + */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id=omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + for ( nbrecv =0; nbrecv=1 ) + TIC(t1); +#endif + + recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + k = *recvbuf0; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + fflush(stdout); +#endif + if(status.MPI_TAG==BC_U){ + // --nfrecvx; + nbrecvx_buf++; + lk = LBj( k, grid ); /* local block number */ + if(BcTree_getDestCount(UBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + slsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + }else if(status.MPI_TAG==RD_U){ + + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[lk*aln_i]; + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( bmod_tmp==0 ) { + if(RdTree_IsRoot(URtree_ptr[lk],'s')==YES){ + + knsupc = SuperSize( k ); + for (ii=1;iiinv == 1){ + + Uinv = Uinv_bc_ptr[lk]; + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + if(UBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + + }else{ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) printf(".. U-solve time\t%8.4f\n", t); + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. U-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + { + float *x_col; + int diag; + printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + diag = PNUM( krow, kcol, grid); + if ( iam == diag ) { /* Diagonal process. */ + lk = LBi( k, grid ); + jj = X_BLK( lk ); + x_col = &x[jj]; + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+i, x_col[i]); + } + x_col += knsupc; + } + } + ii += knsupc; + } /* for k ... */ + } +#endif + + psReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
X to B redistribute time\t%8.4f\n", t); + t = SuperLU_timer_(); +#endif + + double tmp1=0; + double tmp2=0; + double tmp3=0; + double tmp4=0; + for(i=0;iutime[SOL_TRSM]); + tmp2 = SUPERLU_MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]); + tmp3 = SUPERLU_MAX(tmp3,stat_loc[i]->utime[SOL_COMM]); + tmp4 += stat_loc[i]->ops[SOLVE]; +#if ( PRNTlevel>=2 ) + f(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]); +#endif + } + + stat->utime[SOL_TRSM] += tmp1; + stat->utime[SOL_GEMM] += tmp2; + stat->utime[SOL_COMM] += tmp3; + stat->ops[SOLVE]+= tmp4; + + /* Deallocate storage. */ + for(i=0;icomm ); + + +#if ( PROFlevel>=2 ) + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) { + printf ("\tPDGSTRS comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + } + } +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t1_sol; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs()"); +#endif + +#if ( PRNTlevel>=2 ) + float for_lu, total, max, avg, temp; + superlu_dist_mem_usage_t num_mem_usage; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + temp = num_mem_usage.total; + + MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + 
printf("**************************************************\n"); + fflush(stdout); + } +#endif + + return; +} /* PSGSTRS */ + diff --git a/SRC/psgstrs1.c b/SRC/psgstrs1.c new file mode 100644 index 00000000..83082157 --- /dev/null +++ b/SRC/psgstrs1.c @@ -0,0 +1,910 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * 
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRS1 solves a system of distributed linear equations
+ *
+ *                   op( sub(A) ) * X = sub( B )
+ *
+ * with a general N-by-N distributed matrix sub( A ) using the LU
+ * factorization computed by PSGSTRF.
+ *
+ * This routine is used only in the iterative refinement routine
+ * psgsrfs_ABXglobal, assuming that the right-hand side is already
+ * distributed in the diagonal processes.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures to store L and U factors,
+ *        and the permutation vectors.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t' structure.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * x      (input/output) float*
+ *        On entry, the right hand side matrix.
+ *        On exit, the solution matrix if info = 0;
+ *
+ *        NOTE: the right-hand side matrix is already distributed on
+ *              the diagonal processes.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves;
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *x, int nrhs, SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int iam, kcol, krow, mycol, myrow; + int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int_t Pc, Pr; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#ifdef ISEND_IRECV + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -8; + if ( *info ) { + pxerr_dist("PSGSTRS1", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs1()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#ifdef ISEND_IRECV + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Compute ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Prepended the block number in the header for lsum[]. + */ + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; + } + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + /*PrintInt10("mod_bit", nlb, mod_bit);*/ + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* diagonal process */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. 
*/ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( !frecv[lk] && !fmod[lk] ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* + * Compute the internal nodes asynchronously by all processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. */ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications. 
+ */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + if ( !iam ) printf("\n.. After L-solve: y =\n"); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. 
+ */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. 
*/ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nlb is the number of local block rows. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. 
*/ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( !brecv[lk] && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. 
*/ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... */ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( !(--brecv[lk]) && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + + /* Deallocate storage. */ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs1()"); +#endif + +} /* PSGSTRS1 */ diff --git a/SRC/psgstrs_Bglobal.c b/SRC/psgstrs_Bglobal.c new file mode 100644 index 00000000..eb972a16 --- /dev/null +++ b/SRC/psgstrs_Bglobal.c @@ -0,0 +1,1040 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * 
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif +static void gather_diag_to_all(int_t, int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t, int_t [], + int_t [], float [], int_t, float []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by psgstrf.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void +psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *B, int_t ldb, int nrhs, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#if defined (ISEND_IRECV) || defined (BSEND) + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. 
*/ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS_BGLOBAL", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs_Bglobal()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#if defined (ISEND_IRECV) || defined (BSEND) + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + if ( !(x = floatMalloc_dist(((size_t)ldalsum) * nrhs + + nlb * XK_H)) ) + ABORT("Malloc fails for x[]."); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Copy B into X on the diagonal processes. + */ + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + jj = X_BLK( lk ); + x[jj - XK_H] = k; /* Block number prepended in the header. */ + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ + x[i + jj + j*knsupc] = B[i + ii + j*ldb]; + } + } + ii += knsupc; + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. 
*/ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( frecv[lk]==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req,stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. 
*/ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + + + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + printf("\n(%d) .. After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. 
+ * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. 
*/ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. 
*/ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. 
+ */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( brecv[lk]==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... 
*/ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--brecv[lk])==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + + /* Copy the solution X into B (on all processes). */ + { + int_t num_diag_procs, *diag_procs, *diag_len; + float *work; + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); + if ( !(work = floatMalloc_dist(((size_t)jj)*nrhs)) ) + ABORT("Malloc fails for work[]"); + gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, + grid, num_diag_procs, diag_procs, diag_len, + B, ldb, work); + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + SUPERLU_FREE(work); + } + + /* Deallocate storage. 
*/ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(x); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif +#ifdef BSEND + SUPERLU_FREE(send_req); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs_Bglobal()"); +#endif + +} /* PSGSTRS_BGLOBAL */ + + +/* + * Gather the components of x vector on the diagonal processes + * onto all processes, and combine them into the global vector y. + */ +static void +gather_diag_to_all(int_t n, int_t nrhs, float x[], + Glu_persist_t *Glu_persist, sLocalLU_t *Llu, + gridinfo_t *grid, int_t num_diag_procs, + int_t diag_procs[], int_t diag_len[], + float y[], int_t ldy, float work[]) +{ + int_t i, ii, j, k, lk, lwork, nsupers, p; + int_t *ilsum, *xsup; + int iam, knsupc, pkk; + float *x_col, *y_col; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy x vector into a buffer. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ + x_col = &x[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; + lwork += knsupc; + x_col += knsupc; + } + } + MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( work, diag_len[p]*nrhs, MPI_FLOAT, pkk, grid->comm ); + } + /* Scatter work[] into global y vector. 
*/ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + ii = FstBlockC( k ); + y_col = &y[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; + lwork += knsupc; + y_col += ldy; + } + } + } +} /* GATHER_DIAG_TO_ALL */ + diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c new file mode 100644 index 00000000..ead09ab5 --- /dev/null +++ b/SRC/psgstrs_lsum.c @@ -0,0 +1,2126 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
+ * 
+ */ + +#include "superlu_sdefs.h" +#include "superlu_defs.h" + +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t lptr, /* Starting position in lsub[*]. */ + int_t luptr, /* Starting position in lusup[*]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat +) +{ + float alpha = 1.0, beta = 0.0; + float *lusup, *lusup1; + float *dest; + int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; + int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + MPI_Status status; + int test_flag; + +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; +#endif +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Llu->Lrowind_bc_ptr[lk]; + lusup = Llu->Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + for (lb = 0; lb < nlb; ++lb) { + ik = lsub[lptr]; /* Global block number, row-wise. 
*/ + nbrow = lsub[lptr+1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow, 1, 1 ); +#else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#endif + stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + lptr += LB_DESCRIPTOR; + rel = xsup[ik]; /* Global row index of block ik. */ + for (i = 0; i < nbrow; ++i) { + irow = lsub[lptr++] - rel; /* Relative row. */ + RHS_ITERATE(j) + dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; + } + luptr += nbrow; + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_GEMM] += t2; +#endif + + if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( lk ); + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#endif +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_TRSM] += t2; +#endif + + stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, ikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nlb1 = lsub1[0] - 1; + lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; + luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, lptr1, luptr1, xsup, + grid, Llu, send_req, stat); + } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + + } /* for lb ... */ + +} /* sLSUM_FMOD */ + + +/************************************************************************/ +void slsum_bmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). 
*/ + float *xk, /* X[k]. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat + ) +{ +/* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + MPI_Status status; + int test_flag; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. 
*/ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + + if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( ik ); + dest = &x[ii]; + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( !brecv[ik] ) { /* Becomes a leaf node. */ + bmod[ik] = -1; /* Do not solve X[k] in the future. */ + lk1 = LBj( gik, grid ); /* Local block number. */ + lsub = Llu->Lrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#endif + stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < grid->nprow; ++p) { + if ( bsendx_plist[lk1][p] != EMPTY ) { + pi = PNUM( p, gikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + if ( Urbs[lk1] ) + slsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ + +} /* slSUM_BMOD */ + + + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t *leaf_send, + int_t *nleaf_send, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0,malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int knsupc; /* Size of supernode k. */ + int_t nlb; /* Number of L blocks. */ + + + knsupc = SuperSize( k ); + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Llu->Lrowind_bc_ptr[lk]; + nlb = lsub[0] - 1; + + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>8*maxsuper){ + // if(0){ + + // Nchunk=floor(num_thread/2.0)+1; + Nchunk=SUPERLU_MIN(num_thread,nlb); + // Nchunk=1; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); +#else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global 
block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* endd for lb ... */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + for (lb=lbstart;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; + + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); + +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { + + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ + } /* if fmod[lk] == 0 */ + } + + } /* end tasklook for nn ... */ + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... 
*/ + + // TOC(t3, t1); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* end else iam == p */ + } /* if fmod[lk] == 0 */ + } + // } +} + + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + + +} /* if nlb>0*/ +} /* sLSUM_FMOD_INV */ + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0, malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. 
*/ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lsub = Llu->Lrowind_bc_ptr[lk]; + + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>4*maxsuper || nrhs>10){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, 
rtemp_loc, &nbrow, 1, 1 ); + #else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } /* end if (lbstart=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. 
*/ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... */ +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } /* end else ... */ + // TOC(t3, t1); + rtemp_loc = &rtemp[sizertemp* thread_id]; + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL) + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { + nlb1 = lsub1[0] - 1; + + slsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + } /* end if nlb>0*/ +} /* end slsum_fmod_inv_master */ + + + +/************************************************************************/ +void slsum_bmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t* root_send, + int_t* nroot_send, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. 
*/ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + int_t nroot_send_tmp; + float *Uinv;/* Inverse of diagonal block */ + float temp; + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof(float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + if(Llu->Unnz[lk]>knsupc*64 || nub>16){ + // if(nub>num_thread){ + // if(nub>16){ + // // // // if(Urbs2[lk]>num_thread){ + // if(Urbs2[lk]>0){ + Nchunk=SUPERLU_MIN(num_thread,nub); + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + // printf("Unnz: %5d nub: %5d knsupc: %5d\n",Llu->Unnz[lk],nub,knsupc); +#ifdef _OPENMP +#pragma omp taskloop firstprivate (stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#endif + for (nn=0;nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. 
*/ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* end for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + // for (i=0 ; iUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif + // slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + // stat, root_send, nroot_send, sizelsum,sizertemp); + //}else{ + slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + //} + + // } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* end for ub ... */ + } /* end else ... */ + +} /* slSUM_BMOD_inv */ + + + +/************************************************************************/ +void slsum_bmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. 
*/ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + float temp; + float *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // if(nub>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +//#ifdef _OPENMP +//#pragma omp taskloop firstprivate (stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +//#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + + if(nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + }else{ + rtemp_loc = &rtemp[sizertemp* thread_id]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. 
*/ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + for (ub = 0; ub < nub; ++ub){ + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i + * File name: pslangs.c + * History: Modified from lapack routine SLANGE + *
+ */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSLANGS returns the value of the one norm, or the Frobenius norm, or
+    the infinity norm, or the element of largest absolute value of a
+    real matrix A.
+
+    Description
+    ===========
+
+    PSLANGS returns the value
+
+       PSLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+                 (
+                 ( norm1(A),         NORM = '1', 'O' or 'o'
+                 (
+                 ( normI(A),         NORM = 'I' or 'i'
+                 (
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum),
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.
+
+    Arguments
+    =========
+
+    NORM    (input) CHARACTER*1
+            Specifies the value to be returned in PSLANGS as described above.
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A.
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+   =====================================================================
+
+*/ + +float pslangs(char *norm, SuperMatrix *A, gridinfo_t *grid) +{ + /* Local variables */ + NRformat_loc *Astore; + int_t m_loc; + float *Aval; + int_t i, j, jcol; + float value=0., sum; + float *rwork; + float tempvalue; + float *temprwork; + + Astore = (NRformat_loc *) A->Store; + m_loc = Astore->m_loc; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + } else if ( strncmp(norm, "M", 1)==0 ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + value = SUPERLU_MAX( value, fabs(Aval[j]) ); + } + + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; +#if 0 + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } +#else /* Sherry ==> */ + if ( !(rwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for rwork."); + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + rwork[jcol] += fabs(Aval[j]); + } + } + + if ( !(temprwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for temprwork."); + MPI_Allreduce(rwork, temprwork, A->ncol, MPI_FLOAT, MPI_SUM, grid->comm); + value = 0.; + for (j = 0; j < A->ncol; ++j) { + value = SUPERLU_MAX(value, temprwork[j]); + } + SUPERLU_FREE (temprwork); + SUPERLU_FREE (rwork); +#endif + } else if ( strncmp(norm, "I", 1)==0 ) { + /* Find normI(A). 
*/ + value = 0.; + sum = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + sum += fabs(Aval[j]); + value = SUPERLU_MAX(value, sum); + } + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else { + ABORT("Illegal norm specified."); + } + + return (value); + +} /* pslangs */ diff --git a/SRC/pslaqgs.c b/SRC/pslaqgs.c new file mode 100644 index 00000000..3fbf9451 --- /dev/null +++ b/SRC/pslaqgs.c @@ -0,0 +1,151 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Equilibrates a general sparse M by N matrix + * + *
+ * File name:	pslaqgs.c
+ * History:     Modified from LAPACK routine SLAQGE
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSLAQGS equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (input) float*, dimension (A->nrow)
+            The row scale factors for A.
+
+    C       (input) float*, dimension (A->ncol)
+            The column scale factors for A.
+
+    ROWCND  (input) float
+            Ratio of the smallest R(i) to the largest R(i).
+
+    COLCND  (input) float
+            Ratio of the smallest C(i) to the largest C(i).
+
+    AMAX    (input) float
+            Absolute value of largest matrix entry.
+
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.
+            = 'N':  No equilibration
+            = 'R':  Row equilibration, i.e., A has been premultiplied by
+                    diag(R).
+            = 'C':  Column equilibration, i.e., A has been postmultiplied
+                    by diag(C).
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).
+
+    Internal Parameters
+    ===================
+
+    THRESH is a threshold value used to decide if row or column scaling
+    should be done based on the ratio of the row or column scaling
+    factors.  If ROWCND < THRESH, row scaling is done, and if
+    COLCND < THRESH, column scaling is done.
+
+    LARGE and SMALL are threshold values used to decide if row scaling
+    should be done based on the absolute size of the largest matrix
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.
+
+    =====================================================================
+
+*/ + +void +pslaqgs(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int_t i, j, irow, jcol, m_loc; + float large, small; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] *= c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + Aval[j] *= r[irow]; + ++irow; + } + *(unsigned char *)equed = 'R'; + } else { + /* Both row and column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] = Aval[j] * r[irow] * c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* pslaqgs */ + diff --git a/SRC/pssymbfact_distdata.c b/SRC/pssymbfact_distdata.c new file mode 100644 index 00000000..1488ee18 --- /dev/null +++ b/SRC/pssymbfact_distdata.c @@ -0,0 +1,2831 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Redistribute the symbolic structure of L and U from the distribution + * + *
+ * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
+ * -- Distributes the data from parallel symbolic factorization
+ * -- to numeric factorization
+ * INRIA France -  July 1, 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ * 
+ */ + +/* limits.h: the largest positive integer (INT_MAX) */ +#include + +#include "superlu_sdefs.h" +#include "psymbfact.h" + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Redistribute the symbolic structure of L and U from the distribution
+ * used in the parallel symbolic factorization step to the distribution
+ * used in the parallel numeric factorization step.  On exit, the L and U
+ * structure for the 2D distribution used in the numeric factorization step is
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal
+ * information is also computed and it is stored in Glu_persist->supno
+ * and Glu_persist->xsup.
+ *
+ * This routine allocates memory for storing the structure of L and U
+ * and the supernodes information.  This represents the arrays:
+ * p_xlsub, p_lsub, p_xusub, p_usub,
+ * Glu_persist->supno,  Glu_persist->xsup.
+ *
+ * This routine also deallocates memory allocated during symbolic
+ * factorization routine.  That is, the following arrays are freed:
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub,
+ * Pslu_freeable->xusub, Pslu_freeable->usub,
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
+ * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (Input) int_t
+ *        Order of the input matrix
+ * Pslu_freeable  (Input) Pslu_freeable_t *
+ *        Local L and U structure,
+ *        global to local indexing information.
+ *
+ * Glu_persist (Output) Glu_persist_t *
+ *        Stores on output the information on supernodes mapping.
+ *
+ * p_xlsub (Output) int_t **
+ *         Pointer to structure of L distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_lsub  (Output) int_t **
+ *         Structure of L distributed on a 2D grid of processors,
+ *         stored by columns.
+ *
+ * p_xusub (Output) int_t **
+ *         Pointer to structure of U distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_usub  (Output) int_t **
+ *         Structure of U distributed on a 2D grid of processors,
+ *         stored by rows.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU.
+ *   > 0, number of bytes allocated in this routine when out of memory.
+ *        (an approximation).
+ * 
+ */ + +static float +dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, + Glu_persist_t *Glu_persist, + int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, + gridinfo_t *grid + ) +{ + int iam, nprocs, pc, pr, p, np, p_diag; + int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, + *tmp_ptrToSend, *mem; + int_t *nnzToRecv_l, *nnzToRecv_u; + int_t *send_1, *send_2, nsend_1, nsend_2; + int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; + int_t nsupers, nsupers_i, nsupers_j; + int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; + int_t maxszsn, maxNvtcsPProc; + int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; + int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; + int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; + int_t *xsub_s, *sub_s, *xsub_n, *sub_n; + int_t *globToLoc, nvtcs_loc; + int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, + RecvCnt_l, RecvCnt_u, ind_loc; + int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; + int_t nelts, isize; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword; + + /* ------------------------------------------------------------ + INITIALIZATION. 
+ ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_symbLU()"); +#endif + nprocs = (int) grid->nprow * grid->npcol; + xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; + xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; + maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; + globToLoc = Pslu_freeable->globToLoc; + nvtcs_loc = Pslu_freeable->nvtcs_loc; + xsup_beg_s = Pslu_freeable->xsup_beg_loc; + xsup_end_s = Pslu_freeable->xsup_end_loc; + supno_s = Pslu_freeable->supno_loc; + rcv_luind = NULL; + iword = sizeof(int_t); + dword = sizeof(float); + memAux = 0.; memRet = 0.; + + mem = intCalloc_dist(12 * nprocs); + if (!mem) + return (ERROR_RET); + memAux = (float) (12 * nprocs * sizeof(int_t)); + nnzToRecv = mem; + nnzToSend = nnzToRecv + 2*nprocs; + nnzToSend_l = nnzToSend + 2 * nprocs; + nnzToSend_u = nnzToSend_l + nprocs; + send_1 = nnzToSend_u + nprocs; + send_2 = send_1 + nprocs; + tmp_ptrToSend = send_2 + nprocs; + nnzToRecv_l = tmp_ptrToSend + nprocs; + nnzToRecv_u = nnzToRecv_l + nprocs; + + ptrToSend = nnzToSend; + ptrToRecv = nnzToSend + nprocs; + + nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); + intBuf1 = nvtcs + nprocs; + intBuf2 = nvtcs + 2 * nprocs; + intBuf3 = nvtcs + 3 * nprocs; + intBuf4 = nvtcs + 4 * nprocs; + memAux += 5 * nprocs * sizeof(int); + + maxszsn = sp_ienv_dist(3); + + /* Allocate space for storing Glu_persist_n. 
*/ + if ( !(supno_n = intMalloc_dist(n+1)) ) { + fprintf (stderr, "Malloc fails for supno_n[]."); + return (memAux); + } + memRet += (float) ((n+1) * sizeof(int_t)); + + /* ------------------------------------------------------------ + DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION + ------------------------------------------------------------*/ + + if (nvtcs_loc > INT_MAX) + ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); + intNvtcs_loc = (int) nvtcs_loc; + MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, + 0, grid->comm); + + if (!iam) { + /* set ptrToRecv to point to the beginning of the data for + each processor */ + for (k = 0, p = 0; p < nprocs; p++) { + ptrToRecv[p] = k; + k += nvtcs[p]; + } + } + + if (nprocs > 1) { + temp = NULL; + if (!iam ) { + if ( !(temp = intMalloc_dist (n+1)) ) { + fprintf (stderr, "Malloc fails for temp[]."); + return (memAux + memRet); + } + memAux += (float) (n+1) * iword; + } +#if defined (_LONGINT) + for (p=0; p INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = ptrToRecv; +#endif + MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, + temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); + } + else + temp = supno_s; + + if (!iam) { + nsupers = 0; + p = (int) OWNER( globToLoc[0] ); + gb = temp[ptrToRecv[p]]; + supno_n[0] = nsupers; + ptrToRecv[p] ++; + szsn = 1; + for (j = 1; j < n; j ++) { + if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { + nsupers ++; + p = (int) OWNER( globToLoc[j] ); + gb = temp[ptrToRecv[p]]; + szsn = 1; + } + else { + szsn ++; + } + ptrToRecv[p] ++; + supno_n[j] = nsupers; + } + nsupers++; + if (nprocs > 1) { + SUPERLU_FREE (temp); + memAux -= (float) (n+1) * iword; + } + supno_n[n] = nsupers; + } + + /* reset to 0 nnzToSend */ + for (p = 0; p < 2 *nprocs; p++) + nnzToSend[p] = 0; + + MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); + nsupers = supno_n[n]; + /* Allocate 
space for storing Glu_persist_n. */ + if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { + fprintf (stderr, "Malloc fails for xsup_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers+1) * iword; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF L and U. + ------------------------------------------------------------*/ + gb = EMPTY; + for (i = 0; i < n; i++) { + if (gb != supno_n[i]) { + /* a new supernode starts */ + gb = supno_n[i]; + xsup_n[gb] = i; + } + } + xsup_n[nsupers] = n; + + for (p = 0; p < nprocs; p++) { + send_1[p] = FALSE; + send_2[p] = FALSE; + } + for (gb_n = 0; gb_n < nsupers; gb_n ++) { + i = xsup_n[gb_n]; + if (iam == (int) OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { + k = lsub_s[j]; + if (k >= i) { + gb = supno_n[k]; + p = (int) PNUM( PROW(gb, grid), pc, grid ); + nnzToSend[2*p] ++; + send_1[p] = TRUE; + } + } + for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { + k = usub_s[j]; + if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { + gb = supno_n[k]; + p = PNUM( pr, PCOL(gb, grid), grid); + nnzToSend[2*p+1] ++; + send_2[p] = TRUE; + } + } + + nsend_2 = 0; + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + nnzToSend[2*p+1] += 2; + if (send_2[p]) nsend_2 ++; + } + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) + if (send_2[p] || p == p_diag) { + if (p == p_diag && !send_2[p]) + nnzToSend[2*p+1] += nsend_2; + else + nnzToSend[2*p+1] += nsend_2-1; + send_2[p] = FALSE; + } + nsend_1 = 0; + for (p = pc; p < nprocs; p += grid->npcol) { + nnzToSend[2*p] += 2; + if (send_1[p]) nsend_1 ++; + } + 
for (p = pc; p < nprocs; p += grid->npcol) + if (send_1[p]) { + nnzToSend[2*p] += nsend_1-1; + send_1[p] = FALSE; + } + else + nnzToSend[2*p] += nsend_1; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, + grid->comm); + + nnz_loc_l = nnz_loc_u = 0; + SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; + for (p = 0; p < nprocs; p++) { + if ( p != iam ) { + SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; + SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; + RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; + RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } else { + nnz_loc_l += nnzToRecv[2*p]; + nnz_loc_u += nnzToRecv[2*p+1]; + nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; + nnzToRecv_l[p] = nnzToRecv[2*p]; + nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } + } + + /* Allocate space for storing the symbolic structure after redistribution. */ + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for xlsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_j+1) * iword; + + if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for xusub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_i+1) * iword; + + /* Allocate temp storage for sending/receiving the L/U symbolic structure. 
*/ + if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { + if (!(rcv_luind = + intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { + fprintf (stderr, "Malloc fails for rcv_luind[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) + * iword; + } + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { + fprintf (stderr, "Malloc fails for index[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------------ + LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF L and U. + ------------------------------------------------------------------*/ + sendL = TRUE; + sendU = FALSE; + while (sendL || sendU) { + if (sendL) { + xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; + nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; + } + if (sendU) { + xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; + nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; + } + for (i = 0, j = 0, p = 0; p < nprocs; p++) { + if ( p != iam ) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + ptrToRecv[p] = j; j += nnzToRecv[p]; + } + nnzToRecv[iam] = 0; + + ind_loc = ptrToRecv[iam]; + for (gb_n = 0; gb_n < nsupers; gb_n++) { + nsend_2 = 0; + i = xsup_n[gb_n]; + if (iam == OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid ); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + + if (sendL) { + p = pc; np = grid->nprow; + } else { + p = pr * grid->npcol; np = grid->npcol; + } + for (j = 0; j < np; j++) { + if (p == iam) { + rcv_luind[ind_loc] = gb_n; + rcv_luind[ind_loc+1] = 0; + tmp_ptrToSend[p] = ind_loc + 1; + 
ind_loc += 2; + } + else { + snd_luind[ptrToSend[p]] = gb_n; + snd_luind[ptrToSend[p]+1] = 0; + tmp_ptrToSend[p] = ptrToSend[p] + 1; + ptrToSend[p] += 2; + } + if (sendL) p += grid->npcol; + if (sendU) p++; + } + for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { + k = sub_s[j]; + if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { + gb = supno_n[k]; + if (sendL) + p = PNUM( PROW(gb, grid), pc, grid ); + else + p = PNUM( pr, PCOL(gb, grid), grid); + if (send_1[p] == FALSE) { + send_1[p] = TRUE; + send_2[nsend_2] = k; nsend_2 ++; + } + if (p == iam) { + rcv_luind[ind_loc] = k; ind_loc++; + if (sendL) + xsub_n[LBj( gb_n, grid )] ++; + else + xsub_n[LBi( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = k; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + if (sendL) + for (p = pc; p < nprocs; p += grid->npcol) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if (PNUM(PROW(gb, grid), pc, grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBj( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + if (sendU) + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + if (send_1[p] || p == p_diag) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if(PNUM( pr, PCOL(gb, grid), grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBi( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + } + } + } + + /* reset ptrToSnd to point to the beginning of the data for + each processor (structure needed in MPI_Alltoallv) */ + for (i = 0, p = 0; p < nprocs; p++) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. 
THIS INVOLVES ALL-TO-ALL COMMUNICATION. + Note: it uses MPI_Alltoallv. + ------------------------------------------------------------*/ + if (nprocs > 1) { +#if defined (_LONGINT) + nnzToSend[iam] = 0; + for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || + nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) nnzToSend[p]; + intBuf2[p] = (int) ptrToSend[p]; + intBuf3[p] = (int) nnzToRecv[p]; + intBuf4[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = nnzToSend; intBuf2 = ptrToSend; + intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; +#endif + + MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, + rcv_luind, intBuf3, intBuf4, mpi_int_t, + grid->comm); + } + if (sendL) + nnzToRecv[iam] = nnz_loc_l; + else + nnzToRecv[iam] = nnz_loc_u; + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE. + -------------------------------------------------------------*/ + if (sendU) + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + SUPERLU_FREE (snd_luind); + memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------ + CONVERT THE FORMAT. 
+ ------------------------------------------------------------*/ + /* Initialize the array of column of L/ row of U pointers */ + k = 0; + for (p = 0; p < nprocs; p ++) { + if (p != iam) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + nelts = rcv_luind[i+1]; + if (sendL) + xsub_n[LBj( gb, grid )] = nelts; + else + xsub_n[LBi( gb, grid )] = nelts; + i += nelts + 2; + } + } + k += nnzToRecv[p]; + } + + if (sendL) j = nsupers_j; + else j = nsupers_i; + k = 0; + isize = xsub_n[0]; + xsub_n[0] = 0; + for (gb_l = 1; gb_l < j; gb_l++) { + k += isize; + isize = xsub_n[gb_l]; + xsub_n[gb_l] = k; + } + xsub_n[gb_l] = k + isize; + nnz_loc = xsub_n[gb_l]; + if (sendL) { + lsub_n = NULL; + if (nnz_loc) { + if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for lsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = lsub_n; + } + if (sendU) { + usub_n = NULL; + if (nnz_loc) { + if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for usub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = usub_n; + } + + /* Copy the data into the L column / U row oriented storage */ + k = 0; + for (p = 0; p < nprocs; p++) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + if (gb >= nsupers) + printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n", + iam, p, gb, nsupers, i, i-k); + i += 2; + if (sendL) gb_l = LBj( gb, grid ); + if (sendU) gb_l = LBi( gb, grid ); + for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { + sub_n[j] = rcv_luind[i]; + } + } + k += nnzToRecv[p]; + } + if (sendL) { + sendL = FALSE; sendU = TRUE; + } + else + sendU = FALSE; + } + + /* deallocate memory allocated during symbolic factorization routine */ + if (rcv_luind != NULL) { + SUPERLU_FREE (rcv_luind); + memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; + } + SUPERLU_FREE (mem); + memAux -= 
(float) (12 * nprocs * iword); + SUPERLU_FREE(nvtcs); + memAux -= (float) (5 * nprocs * sizeof(int)); + + if (xlsub_s != NULL) { + SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); + } + if (xusub_s != NULL) { + SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); + } + SUPERLU_FREE (globToLoc); + if (supno_s != NULL) { + SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); + SUPERLU_FREE (supno_s); + } + + Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; + *p_xlsub = xlsub_n; *p_lsub = lsub_n; + *p_xusub = xusub_n; *p_usub = usub_n; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit dist_symbLU()"); +#endif + + return (-memRet); +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.  The lower part is
+ *   stored using a column format and the upper part
+ *   is stored using a row format.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_persist  (Input) Glu_persist_t *
+ *        Information on supernodes mapping.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * p_ainf_colptr (Output) int_t**
+ *         Pointer to the lower part of A distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_ainf_rowind (Output) int_t**
+ *         Structure of the lower part of A distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_ainf_val    (Output) float**
+ *         Numerical values of the lower part of A, distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_asup_rowptr (Output) int_t**
+ *         Pointer to the upper part of A distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_asup_colind (Output) int_t**
+ *         Structure of the upper part of A distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * p_asup_val    (Output) float**
+ *         Numerical values of the upper part of A, distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * ilsum_i  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block row wise).
+ *
+ * ilsum_j  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block column wise).
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated when out of memory.
+ *        (an approximation).
+ * </pre>
+ */ + +static float +sdist_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t **p_ainf_colptr, int_t **p_ainf_rowind, float **p_ainf_val, + int_t **p_asup_rowptr, int_t **p_asup_colind, float **p_asup_val, + int_t *ilsum_i, int_t *ilsum_j + ) +{ + int iam, p, procs; + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; + int_t nsupers, nsupers_i, nsupers_j; + int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ptr_to_send; + float *aij, **aij_send, *nzval, *dtemp; + float *nzval_a; + MPI_Request *send_req; + MPI_Status status; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword, szbuf; + + /* ------------------------------------------------------------ + INITIALIZATION. 
+ ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdist_A()"); +#endif + iword = sizeof(int_t); + dword = sizeof(double); + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + if (!(nnzToRecv = intCalloc_dist(2*procs))) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (ERROR_RET); + } + memAux = (float) (2 * procs * iword); + memRet = 0.; + nnzToSend = nnzToRecv + procs; + nsupers = supno[n-1] + 1; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); + + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + szbuf = k; + + /* Allocate space for storing the triplets after redistribution. 
*/ + if ( !(ia = intMalloc_dist(2*k)) ) { + fprintf (stderr, "Malloc fails for ia[]."); + return (memAux); + } + memAux += (float) (2*k*iword); + ja = ia + k; + if ( !(aij = floatMalloc_dist(k)) ) { + fprintf (stderr, "Malloc fails for aij[]."); + return (memAux); + } + memAux += (float) (k*dword); + + /* Allocate temporary storage for sending/receiving the A triplets. */ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { + fprintf (stderr, "Malloc fails for send_req[]."); + return (memAux); + } + memAux += (float) (2*procs *sizeof(MPI_Request)); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for ia_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(int_t*)); + if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for aij_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(float*)); + if ( !(index = intMalloc_dist(2*SendCnt)) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memAux); + } + memAux += (float) (2*SendCnt*iword); + if ( !(nzval = floatMalloc_dist(SendCnt)) ) { + fprintf(stderr, "Malloc fails for nzval[]."); + return (memAux); + } + memAux += (float) (SendCnt * dword); + if ( !(ptr_to_send = intCalloc_dist(procs)) ) { + fprintf(stderr, "Malloc fails for ptr_to_send[]."); + return (memAux); + } + memAux += (float) (procs * iword); + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for itemp[]."); + return (memAux); + } + memAux += (float) (2*maxnnzToRecv*iword); + if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for dtemp[]."); + return (memAux); + } + memAux += (float) (maxnnzToRecv * dword); + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + aij_send[p] = &nzval[j]; + j += 
nnzToSend[p]; + } + } + } /* if procs > 1 */ + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { + fprintf (stderr, "Malloc fails for *ainf_colptr[]."); + return (memAux); + } + memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; + if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { + fprintf (stderr, "Malloc fails for *asup_rowptr[]."); + return (memAux+memRet); + } + memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. */ + nnz_loc_ainf = nnz_loc_asup = 0; + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. 
+ ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_FLOAT, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + irow = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /* assert(jcol= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + memAux -= 2 * procs * iword; + if ( procs > 1 ) { + SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + SUPERLU_FREE(ptr_to_send); + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + + procs*sizeof(float*) + 2*SendCnt * iword + + SendCnt* dword + procs*iword + + 2*maxnnzToRecv*iword + maxnnzToRecv*dword; + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT. 
+ ------------------------------------------------------------*/ + if (nnz_loc_ainf != 0) { + if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_rowind[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * iword); + if ( !(ainf_val = floatMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_val[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * dword); + } + else { + ainf_rowind = NULL; + ainf_val = NULL; + } + if (nnz_loc_asup != 0) { + if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_colind[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * iword); + if ( !(asup_val = floatMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_val[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * dword); + } + else { + asup_colind = NULL; + asup_val = NULL; + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = ainf_colptr[0]; ainf_colptr[0] = 0; + for (j = 1; j < ilsum_j[nsupers_j]; j++) { + k += jsize; + jsize = ainf_colptr[j]; + ainf_colptr[j] = k; + } + ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; + i = 0; + isize = asup_rowptr[0]; asup_rowptr[0] = 0; + for (j = 1; j < ilsum_i[nsupers_i]; j++) { + i += isize; + isize = asup_rowptr[j]; + asup_rowptr[j] = i; + } + asup_rowptr[ilsum_i[nsupers_i]] = i + isize; + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + jcol = ja[i]; + irow = ia[i]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); + k = ainf_colptr[j]; + ainf_rowind[k] = irow; + ainf_val[k] = aij[i]; + ainf_colptr[j] ++; + } + else { + j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); + k = asup_rowptr[j]; + asup_colind[k] = jcol; + asup_val[k] = 
aij[i]; + asup_rowptr[j] ++; + } + } + + /* Reset the column pointers to the beginning of each column */ + for (j = ilsum_j[nsupers_j]; j > 0; j--) + ainf_colptr[j] = ainf_colptr[j-1]; + for (j = ilsum_i[nsupers_i]; j > 0; j--) + asup_rowptr[j] = asup_rowptr[j-1]; + ainf_colptr[0] = 0; + asup_rowptr[0] = 0; + + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + memAux -= 2*szbuf*iword + szbuf*dword; + + *p_ainf_colptr = ainf_colptr; + *p_ainf_rowind = ainf_rowind; + *p_ainf_val = ainf_val; + *p_asup_rowptr = asup_rowptr; + *p_asup_colind = asup_colind; + *p_asup_val = asup_val; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sdist_A()"); + fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); +#endif + + return (-memRet); +} /* dist_A */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Distribute the input matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *          This routine should not be called for this case, an error
+ *          is generated.  Instead, psdistribute routine should be called.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (Input) int
+ *        Dimension of the matrix.
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (Input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * LUstruct (Input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated for performing the distribution
+ *       of the data, when out of memory.
+ *        (an approximation).
+ * </pre>
+ */ + +float +sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + Pslu_freeable_t *Pslu_freeable, + sLUstruct_t *LUstruct, gridinfo_t *grid) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t Glu_freeable_n; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, + len, len1, nsupc, nsupc_gb, ii, nprocs; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + float *a; + int_t *asub, *xa; + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *xsup, *supno; /* supernode and column mapping */ + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; + int_t next_ind; /* next available position in index[*] */ + int_t next_val; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *uval; /* nonzero values in L and U */ + int_t *recvBuf; + int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + float *lusup_srt; /* nonzero values in L and U */ + float 
**Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in + the full array (local, block column wise) */ + /*-- Auxiliary arrays; freed on return --*/ + int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *LUb_length; /* L,U block length; size nsupers_ij */ + int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ + int_t *LUb_number; /* global block number; size nsupers_ij */ + int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ + int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; +float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + + float memStrLU, memA, + memDist = 0.; /* memory used for redistributing the data, which does + not include the memory for the numerical values + of L and U (positive number)*/ + float memNLU = 0.; /* memory allocated for storing the numerical values of + L and U, that will be used in the numeric + factorization (positive number) */ + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. 
*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); +#endif + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nprocs = grid->npcol * grid->nprow; + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + Astore = (NRformat_loc *) A->Store; + + iword = sizeof(int_t); + dword = sizeof(float); + + if (fact == SamePattern_SameRowPerm) { + ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); + } + + if ((memStrLU = + dist_symbLU (n, Pslu_freeable, + Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) + return (memStrLU); + memDist += (-memStrLU); + xsup = Glu_persist->xsup; /* supernode and column mapping */ + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ + nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ + nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); + if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for ilsum[]."); + return (memDist + memNLU + memTRS); + } + memNLU += (nsupers_i+1) * iword; + if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for ilsum_j[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_j+1) * iword; + + /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. 
*/ + ilsum[0] = 0; + ldaspa = 0; + for (gb = 0; gb < nsupers; gb++) + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + ilsum[nsupers_i] = ldaspa; + + ldaspa_j = 0; ilsum_j[0] = 0; + for (gb = 0; gb < nsupers; gb++) + if (mycol == PCOL( gb, grid )) { + i = SuperSize( gb ); + ldaspa_j += i; + lb = LBj( gb, grid ); + ilsum_j[lb + 1] = ilsum_j[lb] + i; + } + ilsum_j[nsupers_j] = ldaspa_j; + + if ((memA = sdist_A(A, ScalePermstruct, Glu_persist, + grid, &ainf_colptr, &ainf_rowind, &ainf_val, + &asup_rowptr, &asup_colind, &asup_val, + ilsum, ilsum_j)) > 0) + return (memDist + memA + memNLU + memTRS); + memDist += (-memA); + + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ + + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) { + fprintf(stderr, "Calloc fails for ToRecv[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + memNLU += nsupers * iword; + + k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) { + fprintf(stderr, "Malloc fails for ToSendR[]."); + return (memDist + memNLU + memTRS); + } + memNLU += k*sizeof(int_t*); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memDist + memNLU + memTRS); + } + memNLU += j*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + + /* Auxiliary arrays used to set up L and U block data structures. + They are freed on return. 
*/ + if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_length[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Malloc fails for LUb_indptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_number[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_valptr[]."); + return (memDist + memNLU + memTRS); + } + memDist += 4 * nsupers_ij * iword; + + k = CEILING( nsupers, grid->nprow ); + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(nsupers_i * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + memNLU += nsupers_i*sizeof(float*) + nsupers_i*sizeof(int_t*); + Unzval_br_ptr[nsupers_i-1] = NULL; + Ufstnz_br_ptr[nsupers_i-1] = NULL; + + if ( !(ToSendD = SUPERLU_MALLOC(nsupers_i * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for ToSendD[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO; + + memNLU += nsupers_i*iword; + if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_i + nsupers_j)*iword; + + /* Auxiliary arrays used to set up L, U block data structures. + They are freed on return. + k is the number of local row blocks. 
*/ + if ( !(dense = floatCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) + * sp_ienv_dist(3))) ) { + fprintf(stderr, "Calloc fails for SPA dense[]."); + return (memDist + memNLU + memTRS); + } + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for fmod[]."); + return (memDist + memNLU + memTRS); + } + if ( !(bmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for bmod[]."); + return (memDist + memNLU + memTRS); + } + /* ------------------------------------------------ */ + memNLU += 2*nsupers_i*iword + + SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ + fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){ + fprintf(stderr, "Malloc fails for Unnz[]."); + return (memDist + memNLU + memTRS); + } + memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + memNLU += nsupers_j * sizeof(double*) + nsupers_j 
* sizeof(int_t*)+ nsupers_j * sizeof(int_t*); + Lnzval_bc_ptr[nsupers_j-1] = NULL; + Lrowind_bc_ptr[nsupers_j-1] = NULL; + Linv_bc_ptr[nsupers_j-1] = NULL; + Uinv_bc_ptr[nsupers_j-1] = NULL; + Lindval_loc_bc_ptr[nsupers_j-1] = NULL; + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + len = nsupers_j * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ + memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid); /* Local block number row wise */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + if ( myrow == jbrow ) { /* Block row jb in my process row */ + /* Scatter A into SPA. 
*/ + for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { + for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { + if (i >= asup_rowptr[ilsum[nsupers_i]]) + printf ("ERR7\n"); + jcol = asup_colind[i]; + if (jcol >= n) + printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + gb = BlockNum( jcol ); + lb = LBj( gb, grid ); + if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); + jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); + if (jcol >= ldaspa_j) + printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + dense_col[jcol] = asup_val[i]; + } + dense_col += ldaspa_j; + } + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + /* Count number of blocks and length of each block. */ + nrbu = 0; + len = 0; /* Number of column subscripts I own. */ + len1 = 0; /* number of fstnz subscripts */ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + if (i >= xusub[nsupers_i]) printf ("ERR10\n"); + jcol = usub[i]; + gb = BlockNum( jcol ); /* Global block number */ + + /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) + printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", + iam, jb, gb, jcol, jbcol, pc); */ + + lb = LBj( gb, grid ); /* Local block number */ + pc = PCOL( gb, grid ); /* Process col owning this block */ + if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; + /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ + pr = PROW( gb, grid ); + if ( pr != jbrow && mycol == pc) + bsendx_plist[lb][jbrow] = YES; + if (mycol == pc) { + len += nsupc; + LUb_length[lb] += nsupc; + ToSendD[ljb_i] = YES; + if (Urb_marker[lb] <= jb) { /* First see this block */ + if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; + Urb_marker[lb] = jb + 1; + LUb_number[nrbu] = gb; + /* if (gb == 391825 && jb == 145361) + printf ("Pe[%d] T1 [%d %d] nrbu %d \n", + iam, jb, gb, nrbu); 
*/ + nrbu ++; + len1 += SuperSize( gb ); + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[ljb_i];/* Mod. count for back solve */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + } + } /* for i ... */ + + if ( nrbu ) { + /* Sort the blocks of U in increasing block column index. + SuperLU_DIST assumes this is true */ + /* simple insert sort algorithm */ + /* to be transformed in quick sort */ + for (j = 1; j < nrbu; j++) { + k = LUb_number[j]; + for (i=j-1; i>=0 && LUb_number[i] > k; i--) { + LUb_number[i+1] = LUb_number[i]; + } + LUb_number[i+1] = k; + } + + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) { + fprintf (stderr, "Malloc fails for Uindex[]"); + return (memDist + memNLU + memTRS); + } + Ufstnz_br_ptr[ljb_i] = index; + if (!(Unzval_br_ptr[ljb_i] = + floatMalloc_dist(len))) { + fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); + return (memDist + memNLU + memTRS); + } + memNLU += (len1+1)*iword + len*dword; + uval = Unzval_br_ptr[ljb_i]; + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = nrbu; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index */ + index[len1] = -1; /* End marker */ + next_ind = BR_HEADER; + next_val = 0; + for (k = 0; k < nrbu; k++) { + gb = LUb_number[k]; + lb = LBj( gb, grid ); + len = LUb_length[lb]; + LUb_length[lb] = 0; /* Reset vector of block length */ + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) + index[next_ind] = FstBlockC( jb + 1 ); + LUb_valptr[lb] = next_val; + next_val += len; + } + /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], + and the initial values of A from SPA into Unzval_br_ptr[]. 
*/ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + jcol = usub[i]; + gb = BlockNum( jcol ); + + if ( mycol == PCOL( gb, grid ) ) { + lb = LBj( gb, grid ); + k = LUb_indptr[lb]; /* Start fstnz in index */ + index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); + } + } /* for i ... */ + + for (i = 0; i < nrbu; i++) { + gb = LUb_number[i]; + lb = LBj( gb, grid ); + next_ind = LUb_indptr[lb]; + k = FstBlockC( jb + 1); + jcol = ilsum_j[lb]; + for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { + dense_col = dense; + j = index[next_ind+jj]; + for (ii = j; ii < k; ii++) { + uval[LUb_valptr[lb]++] = dense_col[jcol]; + dense_col[jcol] = zero; + dense_col += ldaspa_j; + } + } + } + } else { + Ufstnz_br_ptr[ljb_i] = NULL; + Unzval_br_ptr[ljb_i] = NULL; + } /* if nrbu ... */ + } /* if myrow == jbrow */ + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + if (mycol == jbcol) { /* Block column jb in my process column */ + /* Scatter A_inf into SPA. */ + for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { + for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { + irow = ainf_rowind[i]; + if (irow >= n) printf ("Pe[%d] ERR1\n", iam); + gb = BlockNum( irow ); + if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); + dense_col[irow] = ainf_val[i]; + } + } + dense_col += ldaspa; + } + + /* sort the indices of the diagonal block at the beginning of xlsub */ + if (myrow == jbrow) { + k = xlsub[ljb_j]; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + if (irow < nsupc + fsupc && i != k+irow-fsupc) { + lsub[i] = lsub[k + irow - fsupc]; + lsub[k + irow - fsupc] = irow; + i --; + } + } + } + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. 
*/ + kseen = 0; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && + myrow == jbrow) { + fsendx_plist[ljb_j][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (Lrb_marker[lb] <= jb) { /* First see this block */ + Lrb_marker[lb] = jb + 1; + LUb_length[lb] = 1; + LUb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else + ++LUb_length[lb]; + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* If I am the owner of the diagonal block, order it first in LUb_number. 
+ Necessary for SuperLU_DIST routines */ + kseen = EMPTY; + for (j = 0; j < nrbl; j++) { + if (LUb_number[j] == jb) + kseen = j; + } + if (kseen != EMPTY && kseen != 0) { + LUb_number[kseen] = LUb_number[0]; + LUb_number[0] = jb; + } + + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) { + fprintf (stderr, "Malloc fails for index[]"); + return (memDist + memNLU + memTRS); + } + Lrowind_bc_ptr[ljb_j] = index; + if (!(Lnzval_bc_ptr[ljb_j] = + floatMalloc_dist(len*nsupc))) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb); + return (memDist + memNLU + memTRS); + } + + if (!(Linv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]"); + if (!(Uinv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]"); + + memNLU += len1*iword + len*nsupc*dword; + + if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3))) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + + lusup = Lnzval_bc_ptr[ljb_j]; + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_ind = BC_HEADER; + next_val = 0; + for (k = 0; k < nrbl; ++k) { + gb = LUb_number[k]; + lb = LBi( gb, grid ); + len = LUb_length[lb]; + + Lindval_loc_bc_ptr[ljb_j][k] = lb; + Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; + Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; + + LUb_length[lb] = 0; + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + LUb_valptr[lb] = next_val; + next_ind += len; + next_val += len; + } + /* Propagate the 
compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = LUb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = LUb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + + + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb_j][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb_j]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jjnprow, grid->npcol); + if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { + fprintf (stderr, "Malloc fails for recvBuf[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + 
return (memDist + memNLU + memTRS); + } + + if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) + memDist = nsupers*k*iword +4*nprocs * sizeof(int); + + for (p = 0; p < nprocs; p++) + nnzToRecv[p] = 0; + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + nnzToRecv[p] += grid->npcol; + } + i = 0; + for (p = 0; p < nprocs; p++) { + ptrToRecv[p] = i; + i += nnzToRecv[p]; + ptrToSend[p] = 0; + if (p != iam) + nnzToSend[p] = nnzToRecv[iam]; + else + nnzToSend[p] = 0; + } + nnzToRecv[iam] = 0; + i = ptrToRecv[iam]; + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + if (p == iam) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + for (j = 0; j < grid->npcol; j++, i++) + recvBuf[i] = ToSendR[ljb_j][j]; + } + } + + MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, + recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid ); /* Local block number row wise */ + /* (myrow == jbrow) { + if (ToSendD[ljb_i] == YES) + ToRecv[jb] = 1; + } + else { + if (recvBuf[ptrToRecv[p] + mycol] == YES) + ToRecv[jb] = 2; + } */ + if (recvBuf[ptrToRecv[p] + mycol] == YES) { + if (myrow == jbrow) + ToRecv[jb] = 1; + else + ToRecv[jb] = 2; + } + if (mycol == jbcol) { + for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) + ToSendR[ljb_j][i] = recvBuf[j]; + ToSendR[ljb_j][mycol] = EMPTY; + } + ptrToRecv[p] += grid->npcol; + } + + /* exchange information about bsendx_plist in between column of processors */ + MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, + MPI_MAX, grid->cscp.comm); + + for (jb = 0; jb < nsupers; jb ++) { + jbcol = PCOL( jb, grid); + jbrow = 
PROW( jb, grid); + if (mycol == jbcol) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + if (myrow == jbrow ) { + for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { + (*bsendx_plist)[k] = recvBuf[k]; + if ((*bsendx_plist)[k] != EMPTY) + nbsendx ++; + } + } + else { + for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) + (*bsendx_plist)[k] = EMPTY; + } + } + } + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. 
*/ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + + // if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + + #if ( PRNTlevel>=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); + + if(ibnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... 
*/ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + /* Free the memory used for storing L and U */ + SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); + if (lsub != NULL) + SUPERLU_FREE(lsub); + if (usub != NULL) + SUPERLU_FREE(usub); + + + SUPERLU_FREE(nnzToRecv); + SUPERLU_FREE(ptrToRecv); + SUPERLU_FREE(nnzToSend); + SUPERLU_FREE(ptrToSend); + SUPERLU_FREE(recvBuf); + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + LUstruct->Glu_persist = Glu_persist; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. 
*/ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist, + ToRecv, ToSendR, ToSendD, mod_bit + */ + CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); +#endif + + return (- (memDist+memNLU)); +} /* sdist_psymbtonum */ + diff --git a/SRC/psutil.c b/SRC/psutil.c new file mode 100644 index 00000000..a3a384a4 --- /dev/null +++ b/SRC/psutil.c @@ -0,0 +1,896 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Several matrix utilities + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief Gather A from the distributed compressed row format to global A in compressed column format. + */ +int psCompRow_loc_to_CompCol_global +( + int_t need_value, /* Input. Whether need to gather numerical values */ + SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ + gridinfo_t *grid, /* Input */ + SuperMatrix *GA /* Output */ +) +{ + NRformat_loc *Astore; + NCformat *GAstore; + float *a, *a_loc; + int_t *colind, *rowptr; + int_t *colptr_loc, *rowind_loc; + int_t m_loc, n, i, j, k, l; + int_t colnnz, fst_row, nnz_loc, nnz; + float *a_recv; /* Buffer to receive the blocks of values. */ + float *a_buf; /* Buffer to merge blocks into block columns. */ + int_t *itemp; + int_t *colptr_send; /* Buffer to redistribute the column pointers of the + local block rows. + Use n_loc+1 pointers for each block. */ + int_t *colptr_blk; /* The column pointers for each block, after + redistribution to the local block columns. + Use n_loc+1 pointers for each block. */ + int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ + int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ + int_t *fst_rows, *n_locs; + int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; + int it, n_loc, procs; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psCompRow_loc_to_CompCol_global"); +#endif + + /* Initialization. */ + n = A->ncol; + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ + + /* ------------------------------------------------------------ + FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. 
+ ------------------------------------------------------------*/ + sCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, + &rowind_loc, &colptr_loc); + /* Change local row index numbers to global numbers. */ + for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; + +#if ( DEBUGlevel>=2 ) + printf("Proc %d\n", grid->iam); + PrintInt10("rowind_loc", nnz_loc, rowind_loc); + PrintInt10("colptr_loc", n+1, colptr_loc); +#endif + + procs = grid->nprow * grid->npcol; + if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) + ABORT("Malloc fails for fst_rows[]"); + n_locs = fst_rows + procs; + MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, + grid->comm); + for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; + n_locs[procs-1] = n - fst_rows[procs-1]; + if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) + ABORT("Malloc fails for recvcnts[]"); + sendcnts = recvcnts + procs; + rdispls = sendcnts + procs; + sdispls = rdispls + procs; + itemp_32 = sdispls + procs; + + /* All-to-all transfer column pointers of each block. + Now the matrix view is P-by-P block-partition. 
*/ + /* n column starts for each column, and procs column ends for each block */ + if ( !(colptr_send = intMalloc_dist(n + procs)) ) + ABORT("Malloc fails for colptr_send[]"); + if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) + ABORT("Malloc fails for colptr_blk[]"); + for (i = 0, j = 0; i < procs; ++i) { + for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; + colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ + sendcnts[i] = n_locs[i] + 1; +#if ( DEBUGlevel>=1 ) + assert(j == fst_rows[i]); +#endif + sdispls[i] = j + i; + recvcnts[i] = n_loc + 1; + rdispls[i] = i * (n_loc + 1); + j += n_locs[i]; /* First column of next block in colptr_loc[] */ + } + MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, + colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); + + /* Adjust colptr_blk[] so that they contain the local indices of the + column pointers in the receive buffer. */ + nnz = 0; /* The running sum of the nonzeros counted by far */ + k = 0; + for (i = 0; i < procs; ++i) { + for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { + colnnz = colptr_blk[j+1] - colptr_blk[j]; + /*assert(k<=j);*/ + colptr_blk[k] = nnz; + nnz += colnnz; /* Start of the next column */ + ++k; + } + colptr_blk[k++] = nnz; /* Add an END marker for each block */ + } + /*assert(k == (n_loc+1)*procs);*/ + + /* Now prepare to transfer row indices and values. */ + sdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; + sdispls[i+1] = sdispls[i] + sendcnts[i]; + } + sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; + for (i = 0; i < procs; ++i) { + j = rdispls[i]; /* Point to this block in colptr_blk[]. */ + recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; + } + rdispls[0] = 0; /* Recompute rdispls[] for row indices. 
*/ + for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; + + k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ + if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + rowind_buf = rowind_recv + k; + MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, + rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(a_recv = (float *) floatMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + a_buf = a_recv + k; + MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_FLOAT, + a_recv, recvcnts, rdispls, MPI_FLOAT, + grid->comm); + } + + /* Reset colptr_loc[] to point to the n_loc global columns. */ + colptr_loc[0] = 0; + itemp = colptr_send; + for (j = 0; j < n_loc; ++j) { + colnnz = 0; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1) + j; /* j-th column in i-th block */ + colnnz += colptr_blk[k+1] - colptr_blk[k]; + } + colptr_loc[j+1] = colptr_loc[j] + colnnz; + itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ + } + itemp[n_loc] = colptr_loc[n_loc]; + + /* Merge blocks of row indices into columns of row indices. */ + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + rowind_buf[itemp[j]] = rowind_recv[l]; + ++itemp[j]; + } + } + } + + if ( need_value ) { + for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + a_buf[itemp[j]] = a_recv[l]; + ++itemp[j]; + } + } + } + } + + /* ------------------------------------------------------------ + SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. 
+ ------------------------------------------------------------*/ + GA->nrow = A->nrow; + GA->ncol = A->ncol; + GA->Stype = SLU_NC; + GA->Dtype = A->Dtype; + GA->Mtype = A->Mtype; + GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); + if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); + + /* First gather the size of each piece. */ + nnz_loc = colptr_loc[n_loc]; + MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); + for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; + GAstore->nnz = nnz; + + if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); + if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); + + /* Allgatherv for row indices. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + itemp[i]; + itemp_32[i] = itemp[i]; + } + itemp_32[procs-1] = itemp[procs-1]; + it = nnz_loc; + MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, + itemp_32, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(GAstore->nzval = (float *) floatMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); + MPI_Allgatherv(a_buf, it, MPI_FLOAT, GAstore->nzval, + itemp_32, rdispls, MPI_FLOAT, grid->comm); + } else GAstore->nzval = NULL; + + /* Now gather the column pointers. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + n_locs[i]; + itemp_32[i] = n_locs[i]; + } + itemp_32[procs-1] = n_locs[procs-1]; + MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, + itemp_32, rdispls, mpi_int_t, grid->comm); + + /* Recompute column pointers. 
*/ + for (i = 1; i < procs; ++i) { + k = rdispls[i]; + for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; + itemp[i] += itemp[i-1]; /* prefix sum */ + } + GAstore->colptr[n] = nnz; + +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("After pdCompRow_loc_to_CompCol_global()\n"); + sPrint_CompCol_Matrix_dist(GA); + } +#endif + + SUPERLU_FREE(a_loc); + SUPERLU_FREE(rowind_loc); + SUPERLU_FREE(colptr_loc); + SUPERLU_FREE(fst_rows); + SUPERLU_FREE(recvcnts); + SUPERLU_FREE(colptr_send); + SUPERLU_FREE(colptr_blk); + SUPERLU_FREE(rowind_recv); + if ( need_value) SUPERLU_FREE(a_recv); +#if ( DEBUGlevel>=1 ) + if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); + CHECK_MALLOC(grid->iam, "Exit psCompRow_loc_to_CompCol_global"); +#endif + return 0; +} /* psCompRow_loc_to_CompCol_global */ + + +/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. + */ +int psPermute_Dense_Matrix +( + int_t fst_row, + int_t m_loc, + int_t row_to_proc[], + int_t perm[], + float X[], int ldx, + float B[], int ldb, + int nrhs, + gridinfo_t *grid +) +{ + int_t i, j, k, l; + int p, procs; + int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psPermute_Dense_Matrix()"); +#endif + + procs = grid->nprow * grid->npcol; + if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sendcnts_nrhs = sendcnts + procs; + recvcnts = sendcnts_nrhs + procs; + recvcnts_nrhs = recvcnts + procs; + sdispls = recvcnts_nrhs + procs; + sdispls_nrhs = sdispls + procs; + rdispls = sdispls_nrhs + procs; + rdispls_nrhs = rdispls + procs; + ptr_to_ibuf = rdispls_nrhs + procs; + ptr_to_dbuf = ptr_to_ibuf + procs; + + for (i = 0; i < procs; ++i) sendcnts[i] = 
0; + + /* Count the number of X entries to be sent to each process.*/ + for (i = fst_row; i < fst_row + m_loc; ++i) { + p = row_to_proc[perm[i]]; + ++sendcnts[p]; + } + MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + sendcnts_nrhs[0] = sendcnts[0] * nrhs; + recvcnts_nrhs[0] = recvcnts[0] * nrhs; + for (i = 1; i < procs; ++i) { + sdispls[i] = sdispls[i-1] + sendcnts[i-1]; + sdispls_nrhs[i] = sdispls[i] * nrhs; + rdispls[i] = rdispls[i-1] + recvcnts[i-1]; + rdispls_nrhs[i] = rdispls[i] * nrhs; + sendcnts_nrhs[i] = sendcnts[i] * nrhs; + recvcnts_nrhs[i] = recvcnts[i] * nrhs; + } + k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ + l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ + /*assert(k == m_loc);*/ + /*assert(l == m_loc);*/ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + + for (i = 0; i < procs; ++i) { + ptr_to_ibuf[i] = sdispls[i]; + ptr_to_dbuf[i] = sdispls_nrhs[i]; + } + + /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ + for (i = fst_row; i < fst_row + m_loc; ++i) { + j = perm[i]; + p = row_to_proc[j]; + send_ibuf[ptr_to_ibuf[p]] = j; + j = ptr_to_dbuf[p]; + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + send_dbuf[j++] = X[i-fst_row + k*ldx]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; + } + + /* Transfer the (permuted) row indices and numerical values. */ + MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, + recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); + MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + + /* Copy the buffer into b. 
*/ + for (i = 0, l = 0; i < m_loc; ++i) { + j = recv_ibuf[i] - fst_row; /* Relative row number */ + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + B[j + k*ldb] = recv_dbuf[l++]; + } + } + + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psPermute_Dense_Matrix()"); +#endif + return 0; +} /* psPermute_Dense_Matrix */ + + +/*! \brief Allocate storage in LUstruct */ +void sLUstructInit(const int_t n, sLUstruct_t *LUstruct) +{ + if ( !(LUstruct->etree = intMalloc_dist(n)) ) + ABORT("Malloc fails for etree[]."); + if ( !(LUstruct->Glu_persist = (Glu_persist_t *) + SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) + ABORT("Malloc fails for Glu_persist_t."); + if ( !(LUstruct->Llu = (sLocalLU_t *) + SUPERLU_MALLOC(sizeof(sLocalLU_t))) ) + ABORT("Malloc fails for LocalLU_t."); + LUstruct->Llu->inv = 0; +} + +/*! \brief Deallocate LUstruct */ +void sLUstructFree(sLUstruct_t *LUstruct) +{ +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter sLUstructFree()"); +#endif + + SUPERLU_FREE(LUstruct->etree); + SUPERLU_FREE(LUstruct->Glu_persist); + SUPERLU_FREE(LUstruct->Llu); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sLUstructFree()"); +#endif +} + +/*! \brief Destroy distributed L & U matrices. 
*/ +void +sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter sDestroy_LU()"); +#endif + + sDestroy_Tree(n, grid, LUstruct); + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Lrowind_bc_ptr[i] ) { + SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); +#if 0 // Sherry: the following is not allocated with cudaHostAlloc + //#ifdef GPU_ACC + checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i])); +#endif + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); + } + SUPERLU_FREE (Llu->Lrowind_bc_ptr); + SUPERLU_FREE (Llu->Lnzval_bc_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i = 0; i < nb; ++i) + if ( Llu->Ufstnz_br_ptr[i] ) { + SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); + SUPERLU_FREE (Llu->Unzval_br_ptr[i]); + } + SUPERLU_FREE (Llu->Ufstnz_br_ptr); + SUPERLU_FREE (Llu->Unzval_br_ptr); + + /* The following can be freed after factorization. */ + SUPERLU_FREE(Llu->ToRecv); + SUPERLU_FREE(Llu->ToSendD); + SUPERLU_FREE(Llu->ToSendR[0]); + SUPERLU_FREE(Llu->ToSendR); + + /* The following can be freed only after iterative refinement. 
*/ + SUPERLU_FREE(Llu->ilsum); + SUPERLU_FREE(Llu->fmod); + SUPERLU_FREE(Llu->fsendx_plist[0]); + SUPERLU_FREE(Llu->fsendx_plist); + SUPERLU_FREE(Llu->bmod); + SUPERLU_FREE(Llu->bsendx_plist[0]); + SUPERLU_FREE(Llu->bsendx_plist); + SUPERLU_FREE(Llu->mod_bit); + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) { + SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]); + } + SUPERLU_FREE(Llu->Lindval_loc_bc_ptr); + + nb = CEILING(nsupers, grid->npcol); + for (i=0; iLinv_bc_ptr[i]!=NULL) { + SUPERLU_FREE(Llu->Linv_bc_ptr[i]); + } + if(Llu->Uinv_bc_ptr[i]!=NULL){ + SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); + } + } + SUPERLU_FREE(Llu->Linv_bc_ptr); + SUPERLU_FREE(Llu->Uinv_bc_ptr); + SUPERLU_FREE(Llu->Unnz); + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Urbs[i] ) { + SUPERLU_FREE(Llu->Ucb_indptr[i]); + SUPERLU_FREE(Llu->Ucb_valptr[i]); + } + SUPERLU_FREE(Llu->Ucb_indptr); + SUPERLU_FREE(Llu->Ucb_valptr); + SUPERLU_FREE(Llu->Urbs); + + SUPERLU_FREE(Glu_persist->xsup); + SUPERLU_FREE(Glu_persist->supno); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sDestroy_LU()"); +#endif +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * 
+ */ +int_t +psgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, sSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. 
*/ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). */ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + 
rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PSGSTRS_INIT */ + + +/*! \brief Initialize the data structure for the solution phase. + */ +int sSolveInit(superlu_dist_options_t *options, SuperMatrix *A, + int_t perm_r[], int_t perm_c[], int_t nrhs, + sLUstruct_t *LUstruct, gridinfo_t *grid, + sSOLVEstruct_t *SOLVEstruct) +{ + int_t *row_to_proc, *inv_perm_c, *itemp; + NRformat_loc *Astore; + int_t i, fst_row, m_loc, p; + int procs; + + Astore = (NRformat_loc *) A->Store; + fst_row = Astore->fst_row; + m_loc = Astore->m_loc; + procs = grid->nprow * grid->npcol; + + if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) + ABORT("Malloc fails for row_to_proc[]"); + SOLVEstruct->row_to_proc = row_to_proc; + if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) + ABORT("Malloc fails for inv_perm_c[]."); + for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; + SOLVEstruct->inv_perm_c = inv_perm_c; + + /* ------------------------------------------------------------ + EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. + SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. + + NOTE: For those processes that do not own any row, it must + must be set so that fst_row == A->nrow. 
+ ------------------------------------------------------------*/ + if ( !(itemp = intMalloc_dist(procs+1)) ) + ABORT("Malloc fails for itemp[]"); + MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, + grid->comm); + itemp[procs] = A->nrow; + for (p = 0; p < procs; ++p) { + for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; + } +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("fst_row = %d\n", fst_row); + PrintInt10("row_to_proc", A->nrow, row_to_proc); + PrintInt10("inv_perm_c", A->ncol, inv_perm_c); + } +#endif + SUPERLU_FREE(itemp); + +#if 0 + /* Compute the mapping between rows and processes. */ + /* XSL NOTE: What happens if # of mapped processes is smaller + than total Procs? For the processes without any row, let + fst_row be EMPTY (-1). Make sure this case works! */ + MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, + grid->comm); + itemp[procs] = n; + for (p = 0; p < procs; ++p) { + j = itemp[p]; + if ( j != EMPTY ) { + k = itemp[p+1]; + if ( k == EMPTY ) k = n; + for (i = j ; i < k; ++i) row_to_proc[i] = p; + } + } +#endif + + get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, + &SOLVEstruct->num_diag_procs, + &SOLVEstruct->diag_procs, + &SOLVEstruct->diag_len); + + /* Setup communication pattern for redistribution of B and X. */ + if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + ABORT("Malloc fails for gstrs_comm[]"); + psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct); + + if ( !(SOLVEstruct->gsmv_comm = (psgsmv_comm_t *) + SUPERLU_MALLOC(sizeof(psgsmv_comm_t))) ) + ABORT("Malloc fails for gsmv_comm[]"); + SOLVEstruct->A_colind_gsmv = NULL; + + options->SolveInitialized = YES; + return 0; +} /* sSolveInit */ + +/*! \brief Release the resources used for the solution phase. 
+ */ +void sSolveFinalize(superlu_dist_options_t *options, sSOLVEstruct_t *SOLVEstruct) +{ + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + psgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; + } +} /* sSolveFinalize */ + +void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* sDestroy_A3d_gathered_on_2d */ + + +/*! 
\brief Check the inf-norm of the error vector + */ +void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx, + float xtrue[], int_t ldxtrue, MPI_Comm slucomm) +{ + float err, xnorm, temperr, tempxnorm; + float *x_work, *xtrue_work; + int i, j; + + for (j = 0; j < nrhs; j++) { + x_work = &x[j*ldx]; + xtrue_work = &xtrue[j*ldxtrue]; + err = xnorm = 0.0; + for (i = 0; i < n; i++) { + err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); + xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); + } + + /* get the golbal max err & xnrom */ + temperr = err; + tempxnorm = xnorm; + MPI_Allreduce( &temperr, &err, 1, MPI_FLOAT, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_FLOAT, MPI_MAX, slucomm); + + err = err / xnorm; + if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + } +} + +/*! \brief Destroy broadcast and reduction trees used in triangular solve */ +void +sDestroy_Tree(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter Destroy_Tree()"); +#endif + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); + } + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); + } + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sDestroy_Tree()"); +#endif +} + + diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c index 597c364a..fe2b722c 100644 
--- a/SRC/psymbfact.c +++ b/SRC/psymbfact.c @@ -353,8 +353,8 @@ float symbfact_dist #endif /* Allocate storage common to the symbolic factor routines */ - if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, - &Llu_symbfact, &VInfo, &CS, &PS)) + if ((iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, + &Llu_symbfact, &VInfo, &CS, &PS))) return (PS.allocMem); /* Copy the redistributed input matrix AS at the end of the memory buffer allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in @@ -2067,15 +2067,15 @@ symbfact_vtx /* TEST available memory */ if (next >= x_aind_end) { if (domain_symb) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - } else if (mem_error = - psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, - computeL, LL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, 0, + computeL, DOMAIN_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); + } else if ( (mem_error = + psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, + computeL, LL_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); x_aind_end = xsub[vtx_lid + 1]; @@ -2246,9 +2246,9 @@ updateRcvd_prGraph /* test if enough memory in usubPr array */ if (ind >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) ) + return (mem_error); if (computeL) subPr = Llu_symbfact->lsubPr; else @@ -2367,9 +2367,9 @@ update_prGraph if (sn_elt < lstVtx_blk) { sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset; if ((*p_indsubPr) + 2 >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) ) + return 
(mem_error); if (computeL) { subPr = Llu_symbfact->lsubPr; szsubPr = Llu_symbfact->szLsubPr; } @@ -2523,16 +2523,16 @@ blk_symbfact prval_curvtx = n; /* Compute nonzero structure L(:,vtx) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextl, - marker, - lsub_rcvd, lsub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, - &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, - &neltsMatched_L, markl1_vtx, &prval_curvtx, - vtx_bel_snU, &vtx_bel_snL)) + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextl, + marker, + lsub_rcvd, lsub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, + &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, + &neltsMatched_L, markl1_vtx, &prval_curvtx, + vtx_bel_snU, &vtx_bel_snL)) ) return (mem_error); lsub = Llu_symbfact->lsub; @@ -2541,17 +2541,17 @@ blk_symbfact #endif /* Compute nonzero structure of U(vtx,:) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextu, - marker, - usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, - &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, - &neltsMatched_U, marku1_vtx, &prval_curvtx, - vtx_bel_snL, &vtx_bel_snU)) - return (mem_error); + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextu, + marker, + usub_rcvd, usub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, + &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, + &neltsMatched_U, marku1_vtx, &prval_curvtx, + vtx_bel_snL, &vtx_bel_snU)) ) + return (mem_error); usub = Llu_symbfact->usub; #ifdef TEST_SYMB @@ -2618,11 +2618,11 @@ blk_symbfact *p_nextu = xusub[vtx_lid]; nsuper_loc += 1; *p_nsuper_loc = nsuper_loc; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, 
ind_sizes1, ind_sizes2, - sizes, fstVtxSep, vtx, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, vtx, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); /* set up neltsZr and neltsTotal */ vtx = lstVtx_blk; @@ -3157,7 +3157,8 @@ expand_RL if (!computeL) marker[vtx] = markl; - for (ii; ii < mpnelts; ii++) { + //for (ii; ii < mpnelts; ii++) { // Sherry: compiler warning + for (; ii < mpnelts; ii++) { elt = lsub_rcvd[ii]; if (elt >= vtx) { if (marker[elt] != markl) { @@ -3201,10 +3202,10 @@ expand_RL } nextl = xlsub[vtxXp_lid+1]; - if (mem_error = - psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, - computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, + computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); return 0; } @@ -3332,9 +3333,9 @@ rl_update /* test if enough memory in usubPr array */ if (ind > Llu_symbfact->szLsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) ) + return (mem_error); usubPr = Llu_symbfact->lsubPr; } @@ -3456,19 +3457,20 @@ rl_update if (!computeL) marker[vtx] = markl; PS->nops += mpnelts - ii; - for (ii; ii < mpnelts; ii++) { + //for (ii; ii < mpnelts; ii++) { // Sherry: compiler warning + for (; ii < mpnelts; ii++) { elt = lsub_rcvd[ii]; if (elt >= vtx) { if (marker[elt] != markl) { /* add elt to structure of vtx */ if (nextl >= xlsub[vtx_lid + 1]) { - if (mem_error = - expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, vtx, i, - lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, - fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, - &markl, marker, Pslu_freeable, Llu_symbfact, 
VInfo, PS)) - return (mem_error); + if ( (mem_error = + expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, + usub_rcvd, usub_rcvd_sz, vtx, i, + lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, + fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, + &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); if (computeL) { lsub = Llu_symbfact->lsub; if (!computeRcvd) @@ -3565,20 +3567,20 @@ dnsUpSeps_symbfact else vtx_elt = fstVtx_lvl; if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, - nextl + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, + nextl + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, - nextu + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, + nextu + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3610,10 +3612,10 @@ dnsUpSeps_symbfact if (lsub[k] >= fstVtx_blk) { lsub[nextl] = lsub[k]; nextl ++; if (nextl >= MEM_LSUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } @@ -3621,10 +3623,10 @@ dnsUpSeps_symbfact if (usub[k] > fstVtx_blk) { usub[nextu] = usub[k]; nextu ++; if (nextu >= MEM_USUB( Llu_symbfact, VInfo )) - if (mem_error 
= - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, - USUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, + USUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3892,10 +3894,10 @@ dnsCurSep_symbfact j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, next + j, + computeL, DNS_CURSEP, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); if (computeL) sub = Llu_symbfact->lsub; else sub = Llu_symbfact->usub; @@ -4131,19 +4133,19 @@ denseSep_symbfact } if (VInfo->filledSep == FILLED_SEP) { - if (mem_error = - dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, lstP - fstP, rcvd_dnsSep, p_nextl, - p_nextu, p_mark, p_nsuper_loc, marker, ndCom, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) + if ( (mem_error = + dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, + szSep, lstP - fstP, rcvd_dnsSep, p_nextl, + p_nextu, p_mark, p_nsuper_loc, marker, ndCom, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) ) return (mem_error); } else if (rcvd_dnsSep) - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, EMPTY, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, EMPTY, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); return 0; } @@ -4251,11 +4253,11 @@ interLvl_symbfact /* quick return if all upper separators are dense */ if 
(VInfo->filledSep != FILLED_SEPS) { VInfo->filledSep = FILLED_SEPS; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, - fstVtxSep, - EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, + fstVtxSep, + EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); } return 0; @@ -4774,7 +4776,11 @@ intraLvl_symbfact MPI_Irecv (&sz_msg, 1, mpi_int_t, MPI_ANY_SOURCE, tag_intraLvl_szMsg, (*symb_comm), &(request[0])); +#if defined (_LONGINT) if (sz_msg > LONG_MAX) +#else + if (sz_msg > INT_MAX) +#endif ABORT("ERROR in intraLvl_symbfact size to send > LONG_MAX\n"); } MPI_Waitany (2, request, index_req, status); diff --git a/SRC/psymbfact_util.c b/SRC/psymbfact_util.c index 40b9c864..39c066ae 100644 --- a/SRC/psymbfact_util.c +++ b/SRC/psymbfact_util.c @@ -292,10 +292,10 @@ int_t psymbfact_LUXpand if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; @@ -437,10 +437,10 @@ int_t psymbfact_LUXpand_RL if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) 
return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c new file mode 100644 index 00000000..a3bbe4d9 --- /dev/null +++ b/SRC/pz3dcomm.c @@ -0,0 +1,875 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Communication routines for the 3D algorithm. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_zdefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(doublecomplex), MPI_INFO_NULL, &(a)))
+
+int_t zAllocLlu(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    /* Allocate the per-process L/U panel pointer arrays and the
+     * send/receive bookkeeping arrays of the 2D local LU structure,
+     * initializing every panel pointer to NULL and every flag to 0. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    int_t nbc = CEILING(nsupers, Pc);  /* # local block columns */
+    int_t nbr = CEILING(nsupers, Pr);  /* # local block rows */
+
+    /* L panels: one (index, value) pointer pair per local block column. */
+    int_t **Lrowind_bc_ptr =
+        (int_t**) SUPERLU_MALLOC(nbc * sizeof(int_t*));
+    doublecomplex **Lnzval_bc_ptr =
+        (doublecomplex**) SUPERLU_MALLOC(nbc * sizeof(doublecomplex*));
+    for (int i = 0; i < nbc; ++i) {
+        Lrowind_bc_ptr[i] = NULL;
+        Lnzval_bc_ptr[i] = NULL;
+    }
+
+    /* U panels: one (index, value) pointer pair per local block row. */
+    int_t **Ufstnz_br_ptr =
+        (int_t**) SUPERLU_MALLOC(nbr * sizeof(int_t*));
+    doublecomplex **Unzval_br_ptr =
+        (doublecomplex**) SUPERLU_MALLOC(nbr * sizeof(doublecomplex*));
+    for (int i = 0; i < nbr; ++i) {
+        Ufstnz_br_ptr[i] = NULL;
+        Unzval_br_ptr[i] = NULL;
+    }
+
+    /* Communication bookkeeping; plain int type (Sherry's note).
+     * ToRecv[k]: recv from no one (0), left (1), or up (2). */
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (int i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+    /* ToSendD[lb]: whether the block row must be sent down. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (int i = 0; i < nbr; ++i) ToSendD[i] = 0;
+
+    /* ToSendR[ljb][p]: processes to send the block column right to.
+     * Rows are allocated here but intentionally left uninitialized,
+     * matching the original behavior (filled in later by the setup). */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+    for (int i = 0; i < nbc; ++i)
+        ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+
+    /* Publish everything into the local LU structure. */
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+    Llu->Lnzval_bc_ptr  = Lnzval_bc_ptr;
+    Llu->Ufstnz_br_ptr  = Ufstnz_br_ptr;
+    Llu->Unzval_br_ptr  = Unzval_br_ptr;
+    Llu->ToRecv  = ToRecv;
+    Llu->ToSendD = ToSendD;
+    Llu->ToSendR = ToSendR;
+
+    return 0;
+} /* zAllocLlu */
+
+/* Re-allocate layer-0's U and L panel storage with MPI_Alloc_mem
+ * (via the MPI_INT_ALLOC / MPI_DATATYPE_ALLOC macros), copying the
+ * existing contents and freeing the original SUPERLU_MALLOC'd arrays.
+ * Presumably intended for MPI_MALLOC-based communication paths —
+ * only reachable when MPI_MALLOC is defined (TODO confirm). */
+int_t zmpiMallocLUStruct(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    /* Pass 1: U panels, one per local block row.  For each non-empty
+     * panel, copy (usub, uval) into MPI-allocated memory and swap the
+     * pointers in place. */
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    doublecomplex * uval = Unzval_br_ptr[lb];
+	    doublecomplex * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    /* usub[1] = # of nzval entries, usub[2] = length of usub
+		     * itself, per the U-panel header layout. */
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(doublecomplex));
+		    /* Swap in the MPI-allocated copies, release the originals. */
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /* Pass 2: L panels, one per block column owned by this process
+     * column.  Same copy-and-swap as above. */
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    doublecomplex *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    /* len1 = total length of lsub; len2 = # of nzval
+			     * entries (supernode width * LDA). */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(doublecomplex));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+int_t zzSendLPanel(int_t k, int_t receiver,
+                   zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Send the numerical values of local L panel k to process `receiver`
+     * along the z-dimension of the 3D grid.  No-op unless this process
+     * owns block column k and the panel is non-empty.  Accumulates the
+     * message volume into SCT->commVolRed. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL(iam, grid);
+
+    if (mycol == PCOL(k, grid)) {
+        int_t lk = LBj(k, grid);             /* local block-column number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[lk];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL) {
+            int_t len  = lsub[1];            /* LDA of the nzval[] */
+            int_t len2 = SuperSize(k) * len; /* # entries in the L panel */
+
+            MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k,
+                     grid3d->zscp.comm);
+            SCT->commVolRed += len2 * sizeof(doublecomplex);
+        }
+    }
+    return 0;
+}
+
+
+int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
+                    doublecomplex* Lval_buf,
+                    zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Receive L panel k from `sender` over the z-communicator into
+     * Lval_buf and fold it into the local copy:
+     *     A(k) = alpha*A(k) + beta*A^{sender}(k)
+     * No-op unless this process owns block column k and the panel is
+     * non-empty. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL(iam, grid);
+
+    if (mycol == PCOL(k, grid)) {
+        int_t lk = LBj(k, grid);           /* local block-column number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[lk];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL) {
+            int len  = lsub[1];            /* LDA of the nzval[] */
+            int len2 = SuperSize(k) * len; /* # entries in the L panel */
+            MPI_Status status;
+
+            MPI_Recv(Lval_buf, len2, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* lnzval = alpha*lnzval + beta*Lval_buf */
+            superlu_zscal(len2, alpha, lnzval, 1);
+            superlu_zaxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+        }
+    }
+
+    return 0;
+}
+
+int_t zzSendUPanel(int_t k, int_t receiver,
+                    zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Send the numerical values of local U panel k to process `receiver`
+     * along the z-dimension.  No-op unless this process owns block row k
+     * and the panel is non-empty.  Accumulates message volume into
+     * SCT->commVolRed. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+
+    if (myrow == PROW(k, grid)) {
+        int_t lk = LBi(k, grid);    /* local block-row number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lk];
+
+        if (usub != NULL) {
+            int lenv = usub[1];     /* # nzval entries in this U panel */
+
+            MPI_Send(unzval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k,
+                     grid3d->zscp.comm);
+            SCT->commVolRed += lenv * sizeof(doublecomplex);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
+                    doublecomplex* Uval_buf, zLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Receive U panel k from `sender` over the z-communicator into
+     * Uval_buf and fold it into the local copy:
+     *     U(k) = alpha*U(k) + beta*U^{sender}(k)
+     * No-op unless this process owns block row k and the panel is
+     * non-empty. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+
+    if (myrow == PROW(k, grid)) {
+        int_t lk = LBi(k, grid);    /* local block-row number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lk];
+
+        if (usub != NULL) {
+            int lenv = usub[1];     /* # nzval entries in this U panel */
+            MPI_Status status;
+
+            MPI_Recv(Uval_buf, lenv, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* unzval = alpha*unzval + beta*Uval_buf */
+            superlu_zscal(lenv, alpha, unzval, 1);
+            superlu_zaxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+        }
+    }
+    return 0;
+}
+
+
+int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    /* Broadcast the supernode partition (Glu_persist), the L/U index and
+     * value panels, and the communication bookkeeping arrays from z-layer
+     * 0 to every other layer of the 3D process grid.  Layers != 0
+     * allocate the receiving structures first.  Collective over
+     * grid3d->zscp.comm: all layers must call with the same n. */
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    /* Only layer 0 knows the supernode count before the Bcast below. */
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	zAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist: supernode boundaries and membership */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	zAllocLlu(nsupers, LUstruct, grid3d);
+    
+    zLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    zscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    zscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    /* communication buffer sizes */
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Bookkeeping arrays ToRecv / ToSendD / ToSendR (plain int type). */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+#ifdef MPI_MALLOC
+    /* BUG FIX: was 'mpiMallocLUStruct', a name that does not exist in this
+     * z-precision file; the routine defined above is zmpiMallocLUStruct.
+     * The old call would fail to link whenever MPI_MALLOC was enabled. */
+    if (!grid3d->zscp.Iam)
+	zmpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* zp3dScatter */
+
+
+/* Broadcast (scatter) every local U panel from z-layer 0 to all other
+ * z-layers.  Layers != 0 allocate the receiving usub/uval arrays; after
+ * this call every layer holds an identical copy of the U panels it
+ * co-owns.  Collective over grid3d->zscp.comm — the per-panel Bcast
+ * sequence (flag, lens, lenv, usub, uval) must stay in this exact order
+ * on all layers. */
+int_t zscatter3dUPanels(int_t nsupers,
+		       zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	doublecomplex * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    /* usub[1] = # nzval entries, usub[2] = length of usub itself;
+	     * only layer 0 can read them before the Bcasts below. */
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];
+		    lens = usub[2];
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    /* NOTE(review): the 'if' below guards only the single allocation
+	     * statement inside the #ifdef/#else; the following MPI_Bcast
+	     * executes on every layer, as a collective must. */
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = doublecomplexMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end zScatter3dUPanels */
+
+
+/* Broadcast (scatter) every local L panel from z-layer 0 to all other
+ * z-layers, per block column owned by this process column.  Layers != 0
+ * allocate the receiving lsub/lnzval arrays.  Collective over
+ * grid3d->zscp.comm — the per-panel Bcast sequence (flag, len1, lsub,
+ * len2, lnzval) must stay in this exact order on all layers. */
+int_t zscatter3dLPanels(int_t nsupers,
+                       zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    doublecomplex* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		int_t nrbl, len, len1, len2;
+		/* Only layer 0 can read the panel header; receivers get
+		 * len1/len2 from the Bcasts below (nrbl/len stay unset on
+		 * them but are not read). */
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		/* NOTE(review): each 'if (grid3d->zscp.Iam)' below guards only
+		 * the single allocation statement inside its #ifdef/#else; the
+		 * MPI_Bcast calls that follow run on every layer despite the
+		 * extra indentation — which is what a collective requires. */
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = doublecomplexCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* zscatter3dLPanels */
+
+int_t zcollect3dLpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+    /* Collect the numerical values of every local L panel from z-layer
+     * `layer` into the corresponding panels on z-layer 0.  The index
+     * structures (Lrowind_bc_ptr) are identical across layers, so only
+     * nzval data moves.  Must be called by both layers involved. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub = Lrowind_bc_ptr[ljb];
+	    doublecomplex* lnzval = Lnzval_bc_ptr[ljb];
+
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];             /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /* size of nzval of L panel */
+
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    /* BUG FIX: the receive previously used MPI_DOUBLE, which
+		     * mismatches the doublecomplex send above (half the data
+		     * per element) and corrupts the collected panel.  Receive
+		     * with the same datatype and count as the matching send. */
+		    MPI_Recv(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    /* Collect the numerical values of every local U panel from z-layer
+     * `layer` into the matching panels on z-layer 0.  Index structures
+     * are identical across layers, so only nzval data moves. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+
+    int_t nlbr = CEILING(nsupers, grid->nprow);  /* # local block rows */
+    for (int_t lb = 0; lb < nlbr; ++lb)
+    {
+        int_t *usub = Llu->Ufstnz_br_ptr[lb];
+        doublecomplex *uval = Llu->Unzval_br_ptr[lb];
+
+        if (usub)
+        {
+            int lenv = usub[1];   /* # nzval entries in this U panel */
+
+            if (grid3d->zscp.Iam == layer)
+            {
+                MPI_Send(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0, lb,
+                         grid3d->zscp.comm);
+            }
+            if (!grid3d->zscp.Iam)
+            {
+                MPI_Status status;
+                MPI_Recv(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, layer, lb,
+                         grid3d->zscp.comm, &status);
+            }
+        }
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    /* Pull both the L and the U panels of z-layer `layer` back onto
+     * z-layer 0; collective for the two layers involved. */
+    int_t nSupers = getNsupers(n, LUstruct->Glu_persist);
+
+    zcollect3dLpanels(layer, nSupers, LUstruct, grid3d);
+    zcollect3dUpanels(layer, nSupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+int_t zzeroSetLU(int_t nnodes, int_t* nodeList, zLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    /* For each supernode in nodeList[0..nnodes), zero the numerical
+     * values of the local L panel (if this process owns the block
+     * column) and of the local U panel (if it owns the block row).
+     * The index structures are left untouched. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+    int_t mycol = MYCOL(iam, grid);
+
+    /* Pass 1: zero the L panels. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+        int_t jb = nodeList[node];
+        if (mycol != PCOL(jb, grid)) continue;  /* not our block column */
+
+        int_t ljb = LBj(jb, grid);              /* local block number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[ljb];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[ljb];
+
+        if (lsub != NULL)
+        {
+            int_t len  = lsub[1];               /* LDA of the nzval[] */
+            int_t len2 = SuperSize(jb) * len;   /* size of the L panel */
+            memset(lnzval, 0, len2 * sizeof(doublecomplex));
+        }
+    }
+
+    /* Pass 2: zero the U panels. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+        int_t ib = nodeList[node];
+        if (myrow != PROW(ib, grid)) continue;  /* not our block row */
+
+        int_t lib = LBi(ib, grid);              /* local block number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lib];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lib];
+
+        if (usub != NULL)
+        {
+            int lenv = usub[1];                 /* # nzval entries */
+            memset(unzval, 0, lenv * sizeof(doublecomplex));
+        }
+    }
+
+    return 0;
+}
+
+
+/* Reduce (sum) the L and U panels of the listed ancestor nodes from the
+ * sender grid onto the receiver grid along the z-dimension. */
+int_t zreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        doublecomplex* Lval_buf, doublecomplex* Uval_buf,
+                        zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* alpha = beta = 1: presumably the receiver accumulates the incoming
+       panel into its own copy (contrast zgatherFactoredLU's alpha = 0);
+       confirm against zzRecvLPanel/zzRecvUPanel semantics. */
+    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t nd = 0; nd < nnodes; ++nd)
+    {
+        int_t gb = nodeList[nd];   /* global block number */
+
+        if (myGrid == sender)
+        {
+            zzSendLPanel(gb, receiver, LUstruct, grid3d, SCT);
+            zzSendUPanel(gb, receiver, LUstruct, grid3d, SCT);
+        }
+        else
+        {
+            zzRecvLPanel(gb, sender, alpha, beta, Lval_buf, LUstruct, grid3d, SCT);
+            zzRecvUPanel(gb, sender, alpha, beta, Uval_buf, LUstruct, grid3d, SCT);
+        }
+    }
+    return 0;
+}
+
+
+/* Ship the factored L and U panels of the listed nodes from the sender
+ * grid to the receiver grid. */
+int_t zgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        zLUValSubBuf_t* LUvsb,
+                        zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* alpha = 0, beta = 1: presumably the receiver overwrites its panel
+       with the incoming data rather than accumulating (contrast
+       zreduceAncestors3d's alpha = 1); confirm against zzRecvLPanel. */
+    doublecomplex alpha = {0.0, 0.0}, beta = {1.0, 0.0};
+    doublecomplex *Lval_buf = LUvsb->Lval_buf;
+    doublecomplex *Uval_buf = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t nd = 0; nd < nnodes; ++nd)
+    {
+        int_t gb = nodeList[nd];   /* global block number */
+
+        if (myGrid == sender)
+        {
+            zzSendLPanel(gb, receiver, LUstruct, grid3d, SCT);
+            zzSendUPanel(gb, receiver, LUstruct, grid3d, SCT);
+        }
+        else
+        {
+            zzRecvLPanel(gb, sender, alpha, beta, Lval_buf, LUstruct, grid3d, SCT);
+            zzRecvUPanel(gb, sender, alpha, beta, Uval_buf, LUstruct, grid3d, SCT);
+        }
+    }
+    return 0;
+}
+
+
+/* For every hierarchy level flagged in myZeroTrIdxs[], zero out this
+ * process's local LU entries belonging to that level's tree. */
+int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, zLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+    {
+        if (!myZeroTrIdxs[lvl]) continue;  /* level not flagged for zeroing */
+
+        int_t tree = myTreeIdxs[lvl];
+        zzeroSetLU(nodeCount[tree], nodeList[tree], LUstruct, grid3d);
+    }
+
+    return 0;
+}
+
+
+/* Reduce the panels of every ancestor level above ilvl onto the paired
+ * receiver grid.  Grids 2^ilvl apart in the z-dimension form pairs:
+ * the one whose index is a multiple of 2^(ilvl+1) receives. */
+int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    doublecomplex *Lval_buf = LUvsb->Lval_buf;
+    doublecomplex *Uval_buf = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    /* Work out which side of the pair this z-grid is on. */
+    int_t sender, receiver;
+    if (myGrid % (1 << (ilvl + 1)))
+    {
+        sender   = myGrid;
+        receiver = myGrid - (1 << ilvl);
+    }
+    else
+    {
+        receiver = myGrid;
+        sender   = myGrid + (1 << ilvl);
+    }
+
+    /* Reduce all the ancestors, timing each level's reduction. */
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+    {
+        double t0 = SuperLU_timer_();
+        zreduceAncestors3d(sender, receiver, myNodeCount[alvl], treePerm[alvl],
+                           Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+        SCT->ancsReduce += SuperLU_timer_() - t0;
+    }
+    return 0;
+}
+
+/* Gather the factored L and U panels from all z-grid layers onto
+ * layer 0.  Combining is pairwise: at step ilvl, z-grids 2^ilvl apart
+ * exchange panels, so after maxLvl-1 steps layer 0 holds the complete
+ * factorization. */
+int_t zgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    zLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* Skip levels where this process's tree was zeroed out
+	       (it has no panels to contribute). */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    /* Pair z-grids 2^ilvl apart: the grid whose index is a
+		       multiple of 2^(ilvl+1) receives; its partner sends. */
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    /* Forward every tree from level 0 up to ilvl that the
+		       sender is responsible for. */
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;  /* trees at this level owned by sender */
+			    int_t blvl = maxLvl - alvl - 1;
+			    /* (1<<blvl)-1 is the first tree index of level blvl,
+			       consistent with heap-order numbering of the tree
+			       forest; (sender >> alvl) selects the sender's
+			       subtree within that level. */
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    zgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* zgatherAllFactoredLU */
+
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index 07404e0d..bd0f7c7c 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -12,9 +12,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
*/ @@ -72,9 +73,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; int_t *ptr_to_send; - doublecomplex *aij, **aij_send, *nzval, *dtemp; + doublecomplex *aij, **aij_send, *nzval, *dtemp = NULL; doublecomplex *nzval_a; doublecomplex asum,asum_tot; int iam, it, p, procs, iam_g; @@ -140,8 +141,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, ABORT("Malloc fails for ia[]."); if ( !(aij = doublecomplexMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); + ja = ia + k; } - ja = ia + k; /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { @@ -169,9 +170,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { - ia_send[p] = &index[i]; + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } @@ -215,7 +216,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, NOTE: Can possibly use MPI_Alltoallv. 
------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); @@ -226,7 +228,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; @@ -245,7 +248,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang + //if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } diff --git a/SRC/pzgsmv.c b/SRC/pzgsmv.c index 0c0838ba..f76c3293 100644 --- a/SRC/pzgsmv.c +++ b/SRC/pzgsmv.c @@ -375,11 +375,11 @@ void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm) int_t *it; doublecomplex *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c index 722fc9a9..5decae78 100644 --- a/SRC/pzgssvx.c +++ b/SRC/pzgssvx.c @@ -323,7 +323,7 @@ at the top-level directory. * = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows * of the original matrix to make the diagonal large * relative to the off-diagonal. 
- * = LargeDiag_APWM: use the parallel approximate-weight perfect + * = LargeDiag_HPWM: use the parallel approximate-weight perfect * matching to permute rows of the original matrix * to make the diagonal large relative to the * off-diagonal. @@ -405,7 +405,7 @@ at the top-level directory. * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * - * o R (double*) dimension (A->nrow) + * o R (double *) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). @@ -413,7 +413,7 @@ at the top-level directory. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * - * o C (double*) dimension (A->ncol) + * o C (double *) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). @@ -489,7 +489,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -587,13 +587,13 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Test the input parameters. 
*/ *info = 0; Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) + if ( Fact < DOFACT || Fact > FACTORED ) *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) *info = -1; else if ( options->IterRefine == SLU_EXTRA ) { *info = -1; @@ -664,6 +664,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -913,7 +914,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif - } else { /* use largeDiag_AWPM */ + } else { /* use LargeDiag_HWPM */ #ifdef HAVE_COMBBLAS z_c2cpp_GetHWPM(A, grid, ScalePermstruct); #else @@ -1060,7 +1061,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1082,10 +1083,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1229,11 +1230,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. 
Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz "IFMT " nnzLU %ld sum %e lsum %e N "IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum.r+asum.i, lsum.r+lsum.i, nnz_tot,nnzLU,asum_tot.r+asum_tot.i,lsum_tot.r+lsum_tot.i,A->ncol); fflush(stdout); #endif @@ -1324,7 +1320,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ @@ -1572,6 +1569,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c new file mode 100644 index 00000000..df80cf00 --- /dev/null +++ b/SRC/pzgssvx3d.c @@ -0,0 +1,1590 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PZGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinment is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PZGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above condition are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) zScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) doublecomplex* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) zLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (zLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'zLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) zSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_zdefs.h for the definition of 'zSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + zScalePermstruct_t * ScalePermstruct, + doublecomplex B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + zLUstruct_t * LUstruct, zSOLVEstruct_t * SOLVEstruct, + double *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + doublecomplex *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + doublecomplex *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + doublecomplex *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 'z'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_Z || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("pzgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + zGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (doublecomplex *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter pzgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (doublecomplex *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == 
YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. 
*/ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ + zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + pzgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pzlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... 
LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pzCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (doublecomplex *) GAstore->nzval; + GA_mem_use += nnz * sizeof (doublecomplex); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = zldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + zd_mult(&a[i], &a[i], R1[irow]); + zd_mult(&a[i], &a[i], C1[icol]); +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i])); + else if (job == 4) + dsum += slud_z_abs1(&a[i]); + else if (job == 5) + dprod *= slud_z_abs1(&a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + z_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pzlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: pzgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pzdistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = zdist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + zp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = zinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pzgstrf3d (options, 
m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + zgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + z3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + zDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + zQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 
1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = doublecomplexMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], R[irow]); + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], C[irow]); + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doublecomplexMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pzgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + zSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pzgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + zSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pzgsmv_finalize (SOLVEstruct->gsmv_comm); + pzgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + doublecomplex at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (zSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(zSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pzgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pzgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + pzPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], C[irow]); + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], R[irow]); + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) zScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit pzgssvx3d()"); +#endif + +} diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c index f8819583..29396604 100644 --- a/SRC/pzgssvx_ABglobal.c +++ b/SRC/pzgssvx_ABglobal.c @@ -587,6 +587,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -875,7 +876,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -898,7 +899,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1097,6 +1098,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c index 174ce9ed..a9bf5dd1 100644 --- a/SRC/pzgstrf.c +++ b/SRC/pzgstrf.c @@ -108,7 +108,6 @@ at the top-level directory. */ #include -/*#include "mkl.h"*/ #include "superlu_zdefs.h" #ifdef GPU_ACC @@ -127,7 +126,7 @@ at the top-level directory. 
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -766,11 +765,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int gpublas_nb = get_gpublas_nb(); // default 64 - int nstreams = get_num_gpu_streams (); + int nstreams = get_num_gpu_streams (); // default 8 - int buffer_size = SUPERLU_MAX(max_row_size*nstreams*gpublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); /* array holding last column blk for each partition, used in SchCompUdt--GPU.c */ #if 0 @@ -782,8 +782,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -810,8 +811,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. 
BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -826,16 +831,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,gpuHostMallocDefault)) ) ABORT("Malloc fails for zgemm buffer V"); - if ( iam==0 )DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d GPU Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d GPU Streams \n",nstreams ); + fflush(stdout); + } #endif gpublasHandle_t *handle; @@ -878,10 +886,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -889,7 +898,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU)\n", iam, bigv_size); fflush(stdout); #endif @@ -903,7 +912,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for zgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1756,29 +1766,29 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. 
Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 593b21d5..46461501 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -213,8 +213,7 @@ pzgstrf2_trsm /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES ) { - if ( slud_z_abs1(&lusup[i]) < thresh && - lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* Diagonal */ + if ( slud_z_abs1(&lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf ("(%d) .. col %d, tiny pivot %e ", @@ -363,44 +362,35 @@ pzgstrf2_trsm } /* PZGSTRF2_trsm */ -#if 0 /* COMMENT OUT 3D CODE FOR NOW */ /***************************************************************************** * The following functions are for the new pdgstrf2_ztrsm in the 3D code. 
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, - doublecomplex* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow, + doublecomplex* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; doublecomplex alpha = {1.0, 0.0}; - unsigned long long t1 = _rdtsc(); + double t1 = SuperLU_timer_(); #define GT 32 +#ifdef _OPENMP #pragma omp parallel for +#endif for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); -#if 1 - #if defined (USE_VENDOR_BLAS) - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr, - 1, 1, 1, 1); - #else - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr); - #endif -#else - cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); -#endif + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ - t1 = _rdtsc() - t1; + t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; @@ -408,13 +398,30 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u #pragma GCC push_options #pragma GCC optimize ("O0") -/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    doublecomplex *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, zLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     doublecomplex alpha = {-1.0, 0.0}, zero = {0.0, 0.0}, one = {1.0, 0.0};
 
@@ -424,8 +431,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     int_t jfst = FstBlockC (k);
     int_t jlst = FstBlockC (k + 1);
     doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
-    int_t nsupc = SuperSize (k);
-    int_t nsupr;
+    int nsupc = SuperSize (k);
+    int nsupr;
     if (Llu->Lrowind_bc_ptr[lk])
         nsupr = Llu->Lrowind_bc_ptr[lk][1];
     else
@@ -433,11 +440,11 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     doublecomplex *ublk_ptr = BlockUFactor;
     doublecomplex *ujrow = BlockUFactor;
     int_t luptr = 0;                  /* Point_t to the diagonal entries. */
-    int_t cols_left = nsupc;          /* supernode size */
+    int cols_left = nsupc;          /* supernode size */
     int_t u_diag_cnt = 0;
     int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
-    int_t incx = 1;
-    int_t incy = ld_ujrow;
+    int incx = 1;
+    int incy = ld_ujrow;
 
     for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
     {
@@ -486,18 +493,11 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
         if (--cols_left)
         {
             /*following must be int*/
-            int_t l = nsupc - j - 1;
+            int l = nsupc - j - 1;
 
 	    /* Rank-1 update */
-#if 1
-	    zgeru_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-		    &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
-		    &nsupr);
-#else
-            cblas_zgeru (CblasColMajor, l, cols_left, &alpha, &lusup[luptr + 1], incx,
-                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
-                        nsupr);
-#endif
+            superlu_zger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
             stat->ops[FACT] += 8 * l * cols_left;
         }
 
@@ -508,8 +508,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
 
     //int_t thread_id = omp_get_thread_num();
-    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
-}
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Zgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
@@ -714,7 +714,7 @@ int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 
 int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 			      int_t *usub, doublecomplex *uval, doublecomplex *tempv,
-			      int_t knsupc, int_t nsupr, doublecomplex *lusup,
+			      int_t knsupc, int nsupr, doublecomplex *lusup,
 			      Glu_persist_t *Glu_persist)    /*glupersist for xsup for supersize*/
 {
     doublecomplex alpha = {1.0, 0.0};
@@ -727,34 +727,22 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 
     // printf("klst inside task%d\n", );
     /*find ldu */
-    int_t ldu = 0;
+    int ldu = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
         ldu = SUPERLU_MAX( klst - usub[jj], ldu) ;
     }
 
     /*pack U block into a dense Block*/
-    int_t ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
+    int ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
     	                           uval, tempv);
 
     /*now call ztrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu,
-	     1, 1, 1, 1);
-  #else
-     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu);
-  #endif
-#else
-
-    cblas_ztrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
-                 ldu, ncols, (void*) &alpha, &lusup[luptr], nsupr, tempv, ldu);
-#endif
+    
+    superlu_ztrsm("L", "L", "N", "U", ldu, ncols, alpha,
+		  &lusup[luptr], nsupr, tempv, ldu);
 
     /*now scatter the output into sparse U block*/
     zTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
@@ -762,10 +750,9 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 }
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
-#if 1 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -847,8 +834,10 @@ void pzgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -891,7 +880,9 @@ void pzgstrs2_omp
 #endif
 	    } /* end if segsize > 0 */
 	} /* end for j in parallel ... */
+#ifdef _OPENMP    
 /* #pragma omp taskwait */
+#endif
     }  /* end for b ... */
 
 #ifndef USE_Ublock_info
@@ -914,7 +905,7 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, zLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, doublecomplex *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -935,20 +926,22 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
         zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
-#endif
+#endif /* there are 2 versions of pzgstrs2_omp */
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
new file mode 100644
index 00000000..e3bd65c7
--- /dev/null
+++ b/SRC/pzgstrf3d.c
@@ -0,0 +1,391 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Performs LU factorization in 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "zlustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PZGSTRF3D performs the LU factorization in parallel using 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) zLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) zLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'zLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + zLocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* Test the input parameters. */ + *info = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter pzgstrf3d()"); +#endif + + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + +#if 0 // sherry: not used + zdiagFactBufs_t dFBuf; + zinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); +#endif + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + zscuBufs_t scuBufs; + zinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = 
trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + zLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + zLUValSubBuf_t** LUvsbs = zLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + zdiagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + /* Setting up GPU related data structures */ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + + zInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); + HyP->bigu_size = bigu_size; + int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size (); + HyP->buffer_size = buffer_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + zLUstruct_gpu_t *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + zsluGPU_t sluGPUobj; + zsluGPU_t *sluGPU = &sluGPUobj; + 
sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + + /*Initilize the lookup tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + + /* Initialize GPU data structures */ + zinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nGPUStreams = sluGPU->nGPUStreams; + } + +#endif // end GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + zsparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, 
&factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + zsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + zreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + zreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */ + +#ifdef GPU_ACC + /* This frees the GPU storage allocateed in initSluGPU3D_t() */ + if (superlu_acc_offload) { + zfree_LUstruct_gpu (sluGPU->A_gpu); + } +#endif + + /* Prepare error message - find the smallesr index i that U(i,i)==0 */ + int iinfo; + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + //printf("After factorization: INFO = %d\n", *info); fflush(stdout); + + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + zfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + 
freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + zLluBufFreeArr(numLA, LUvsbs); + zfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit pzgstrf3d()"); +#endif + return 0; + +} /* pzgstrf3d */ diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c index 1e99e227..051d6d2f 100644 --- a/SRC/pzgstrs.c +++ b/SRC/pzgstrs.c @@ -2397,6 +2397,8 @@ for (i=0;icomm ); - + // if (!iam) { printf("DBG: pzgstrs: after Barrier\n"); fflush(stdout);} #if ( PROFlevel>=2 ) { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c index 5fa8db11..4e2e52a9 100644 --- a/SRC/pzgstrs_lsum.c +++ b/SRC/pzgstrs_lsum.c @@ -13,14 +13,15 @@ at the top-level directory. * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * 
*/ @@ -539,11 +540,11 @@ void zlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -611,156 +612,155 @@ void zlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); -#endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; - - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. 
+ */ if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -802,18 +802,18 @@ void zlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &rtemp_loc[nbrow_ref+i + j*nbrow]); - } + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } nbrow_ref+=nbrow1; - } + } /* end for lb ... 
*/ // TOC(t3, t1); @@ -824,147 +824,143 @@ void zlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif /* * Send Xk to process column Pc[k]. 
*/ - if(LBtree_ptr[lk].empty_==NO){ + if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); - } - - /* - * Perform local block modifications. - */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. + */ - { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -973,7 +969,6 @@ void zlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* zLSUM_FMOD_INV */ @@ -1157,9 +1152,9 @@ void zlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. 
*/ z_sub(&lsum[il+irow + j*iknsupc], @@ -1167,14 +1162,15 @@ void zlsum_fmod_inv_master &rtemp_loc[nbrow_ref+i + j*nbrow]); } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... */ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ + + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); @@ -1375,13 +1367,12 @@ void zlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - zlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1393,8 +1384,8 @@ void zlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* zLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end zlsum_fmod_inv_master */ @@ -1454,7 +1445,7 @@ void zlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = ceil(CACHELINE/(double)dword); aln_i = 
ceil(CACHELINE/(double)iword); @@ -1518,9 +1509,9 @@ void zlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -1531,7 +1522,7 @@ void zlsum_bmod_inv stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... */ } #if ( PROFlevel>=1 ) @@ -1539,7 +1530,6 @@ void zlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1551,9 +1541,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 8 * (iklrow - fnz); + } + stat[thread_id]->ops[SOLVE] += 8 * (iklrow - fnz); } } /* for jj ... */ } @@ -1745,9 +1734,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for 
ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* zlSUM_BMOD_inv */ @@ -2009,9 +1998,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2056,9 +2045,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2097,9 +2086,9 @@ void zlsum_bmod_inv_master if ( iam != p ) { for (ii=1;iiLrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); -#if 0 // Sherry: the following is not allocated with cudaHostAlloc +#if 0 // Sherry: the following is not allocated with gpuHostAlloc //#ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); + checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i])); #endif SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); } @@ -940,27 +940,53 @@ int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, */ void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; - - pxgstrs_finalize(SOLVEstruct->gstrs_comm); - - if ( options->RefineInitialized ) { - pzgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + pzgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + 
SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; } /* zSolveFinalize */ +void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* zDestroy_A3d_gathered_on_2d */ + + /*! 
\brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; @@ -980,8 +1006,8 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); diff --git a/SRC/sSchCompUdt-2Ddynamic.c b/SRC/sSchCompUdt-2Ddynamic.c new file mode 100644 index 00000000..f4dc643e --- /dev/null +++ b/SRC/sSchCompUdt-2Ddynamic.c @@ -0,0 +1,714 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This file contains the main loop of pdgstrf which involves rank k + * update of the Schur complement. + * Uses 2D partitioning for the scatter phase. + * + *
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 14, 2017
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros),
+ *     gather only trailing columns of the L-panel corresponding to the nonzeros
+ *     of U-rows.
+ *   - Padding zeros for nice dimensions of GEMM.
+ *
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ */
+
+#define SCHEDULE_STRATEGY guided
+
+/*
+ * Buffers:
+ *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ *                                            (A matrix in C := A*B )
+ *     bigU : stores the U-panel (B matrix in C := A*B)
+ *     bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
+
+if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+    int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
+    int temp_nbrow;   /* nonzero rows in current block L(i,k) */
+    lptr  = lptr0;
+    luptr = luptr0;
+    int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+			   and remaining part.  */
+
+    /*******************************************************************
+     * Separating L blocks into the top part within look-ahead window
+     * and the remaining ones.
+     *******************************************************************/
+
+     int lookAheadBlk=0, RemainBlk=0;
+
+     tt_start = SuperLU_timer_();
+
+     /* Sherry -- can this loop be threaded?? */
+     /* Loop through all blocks in L(:,k) to set up pointers to the start
+      * of each block in the data arrays.
+      *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
+      *   - lookAheadStRow[i] := number of nonzero rows before block i
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[]
+      *   - (ditto Remain_Info[i])
+      */
+     for (int i = 0; i < nlb; ++i) {
+	 ib = lsub[lptr];            /* Block number of L(i,k). */
+	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+
+	 int look_up_flag = 1; /* assume ib is outside look-up window */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+	      ++j) {
+		 if ( ib == perm_c_supno[j] ) {
+		     look_up_flag = 0; /* flag ib within look-up window */
+                     break;            /* Sherry -- can exit the loop?? */
+                 }
+	 }
+
+	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
+	     if (lookAheadBlk==0) {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
+	     } else {
+		 lookAheadFullRow[lookAheadBlk] =
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
+	     }
+	     lookAheadStRow[lookAheadBlk] = cum_nrow;
+	     lookAhead_lptr[lookAheadBlk] = lptr;
+	     lookAhead_ib[lookAheadBlk] = ib;
+	     lookAheadBlk++;
+	 } else { /* ib is not in look-up window */
+	     if ( RemainBlk==0 ) {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow;
+	     } else {
+		 Remain_info[RemainBlk].FullRow =
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;
+	     }
+             RemainStRow[RemainBlk] = cum_nrow;
+             // Remain_lptr[RemainBlk] = lptr;
+	     Remain_info[RemainBlk].lptr = lptr;
+	     // Remain_ib[RemainBlk] = ib;
+	     Remain_info[RemainBlk].ib = ib;
+	     RemainBlk++;
+	 }
+
+         cum_nrow += temp_nbrow;
+
+	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+	 lptr += temp_nbrow;     /* Move to next block */
+	 luptr += temp_nbrow;
+     }  /* for i ... set up pointers for all blocks in L(:,k) */
+
+     lptr = lptr0;
+     luptr = luptr0;
+
+     /* leading dimension of L look-ahead buffer, same as Lnbrow */
+     //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+     Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+     /* leading dimension of L remaining buffer, same as Rnbrow */
+     //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+     /* Piyush fix */
+     //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+     nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
+     LookAheadRowSepMOP += 2*knsupc*(nbrow);
+
+     /***********************************************
+      * Gather U blocks (AFTER LOOK-AHEAD WINDOW)   *
+      ***********************************************/
+     tt_start = SuperLU_timer_();
+
+     if ( nbrow > 0 ) { /* L(:,k) is not empty */
+	 /*
+	  * Counting U blocks
+	  */
+     	 ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
+	 int temp_ncols = 0;
+
+	 /* jj0 contains the look-ahead window that was updated in
+	    dlook_ahead_update.c. Now the search can continue from that point,
+	    not to start from block 0. */
+#if 0 // Sherry comment out 5/21/2018
+	 /* Save pointers at location right after look-ahead window
+	    for later restart. */
+	 iukp0 = iukp;
+	 rukp0 = rukp;
+#endif
+
+	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+
+         /*
+	  * Loop through all blocks in U(k,:) to set up pointers to the start
+          * of each block in the data arrays, store them in Ublock_info[j]
+          * for block U(k,j).
+  	  */
+	 for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+	     temp_ncols = 0;
+#if 1
+	     /* Cannot remove following call, since perm_u != Identity  */
+	     arrive_at_ublock(
+			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			      iukp0, rukp0, usub, perm_u, xsup, grid
+			      );
+#else
+	     jb = usub[iukp];
+	     /* ljb = LBj (jb, grid);   Local block number of U(k,j). */
+	     nsupc = SuperSize(jb);
+	     iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+	     Ublock_info[j].iukp = iukp;
+	     Ublock_info[j].rukp = rukp;
+	     Ublock_info[j].jb = jb;
+
+	     /* if ( iam==0 )
+		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+			"Ublock_info[j].jb %d, nsupc %d\n",
+			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+			Ublock_info[j].jb, nsupc); */
+
+	     /* Prepare to call GEMM. */
+	     jj = iukp;
+	     for (; jj < iukp+nsupc; ++jj) {
+		 segsize = klst - usub[jj];
+		 if ( segsize ) {
+                    ++temp_ncols;
+                    if ( segsize > ldu ) ldu = segsize;
+		 }
+	     }
+
+	     Ublock_info[j].full_u_cols = temp_ncols;
+	     ncols += temp_ncols;
+#if 0 // Sherry comment out 5/31/2018
+	     /* Jump number of nonzeros in block U(k,jj);
+		Move to block U(k,j+1) in nzval[] array.  */
+	     rukp += usub[iukp - 1];
+	     iukp += nsupc;
+#endif
+         } /* end for j ... compute ldu & ncols */
+
+	 /* Now doing prefix sum on full_u_cols.
+	  * After this, full_u_cols is the number of nonzero columns
+          * from block 0 to block j.
+          */
+	 for ( j = jj0+1; j < nub; ++j) {
+	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
+	 }
+
+	 /* Padding zeros to make {m,n,k} multiple of vector length. */
+	 jj = 8; //n;
+	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+	     gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+	     gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+	     //gemm_n_pad = ncols;
+	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+	     gemm_k_pad = ldu;
+
+	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
+		 for (j = 0; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = 0; i < Rnbrow; ++i)
+		 for (j = ldu; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
+		 for (j = 0; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	     for (i = 0; i < ldu; ++i)
+		 for (j = ncols; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	 } else {
+	     gemm_m_pad = Rnbrow;
+	     gemm_n_pad = ncols;
+	     gemm_k_pad = ldu;
+	 }
+
+	 tempu = bigU; /* buffer the entire row block U(k,:) */
+
+         /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+    private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+    default (shared) schedule(SCHEDULE_STRATEGY)
+#endif
+        for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+
+            if (j==jj0) tempu = bigU;
+            //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+            else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
+
+            /* == processing each of the remaining columns in parallel == */
+#if 0
+	    /* Can remove following call, since search was already done.  */
+            arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			     iukp0, rukp0, usub,perm_u, xsup, grid);
+#else
+	    iukp = Ublock_info[j].iukp;
+	    rukp = Ublock_info[j].rukp;
+	    jb = Ublock_info[j].jb;
+	    nsupc = SuperSize (jb );
+#endif
+            /* Copy from U(k,j) to tempu[], padding zeros.  */
+            for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+                    lead_zero = ldu - segsize;
+                    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+		    //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+		    for (i = 0; i < segsize; ++i)
+                    	tempu[i+lead_zero] = uval[rukp+i];
+                    rukp += segsize;
+                    tempu += gemm_k_pad;
+                }
+	    }
+        }   /* parallel for j = jj0 .. nub */
+
+#if 0
+	if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+	fflush(stdout);
+#endif
+
+        GatherMOP += 2*ldu*ncols;
+
+    }  /* end if (nbrow>0), end gather U blocks */
+
+    GatherUTimer += SuperLU_timer_() - tt_start;
+    int jj_cpu = nub;       /* limit between CPU and GPU */
+    int thread_id;
+    /*tempv = bigV;*/
+
+    /**********************
+     * Gather L blocks    *
+     **********************/
+     tt_start = SuperLU_timer_();
+
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+     for (i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest, temp_nbrow;
+	 if ( i==0 ) {
+	     StRowDest = 0;
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+
+	 int StRowSource = lookAheadStRow[i];
+
+	 /* Now copying one block into L lookahead buffer */
+	 /* #pragma omp parallel for (gives slow down) */
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+						    corresponding to zero U rows */
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+	     //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+	     tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+	     tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+	     memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+		    &lusup[luptr+j*nsupr + StRowSource],
+		    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... gather Lval blocks from lookahead window */
+
+     /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+    schedule(SCHEDULE_STRATEGY)
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+         int StRowDest, temp_nbrow;
+         if ( i==0 )  {
+	     StRowDest  = 0;
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+	 }
+
+	 int StRowSource = RemainStRow[i];
+
+	 /* Now copying a block into L remaining buffer */
+	 // #pragma omp parallel for (gives slow down)
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (int j = knsupc-ldu; j < knsupc; ++j) {
+	     // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+	     //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+	     tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+	     tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+	     memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+		    &lusup[luptr+j*nsupr + StRowSource],
+                    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... copy Lval into the remaining buffer */
+
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
+
+
+     /*************************************************************************
+      * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+      *************************************************************************/
+     tempu = bigU;  /* setting to the start of padded U(k,:) */
+
+     if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+	 /***************************************************************
+	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+	  ***************************************************************/
+
+	 /* Count flops for total GEMM calls */
+	 ncols = Ublock_info[nub-1].full_u_cols;
+ 	 flops_t flps = 2.0 * (flops_t)Lnbrow * ldu * ncols;
+	 LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+	 schur_flop_counter += flps;
+	 stat->ops[FACT]    += flps;
+	 LookAheadGEMMFlOp  += flps;
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id)
+	 {
+#ifdef _OPENMP	 
+	   thread_id = omp_get_thread_num();
+#else	   
+	   thread_id = 0;
+#endif
+
+	   /* Ideally, should organize the loop as:
+	      for (j = 0; j < nub; ++j) {
+	          for (lb = 0; lb < lookAheadBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+		  }
+	      }
+	      But now, we use collapsed loop to achieve more parallelism.
+	      Total number of block updates is:
+	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+	   */
+
+	   int i = sizeof(int);
+	   int* indirect_thread    = indirect + (ldt + CACHELINE/i) * thread_id;
+	   int* indirect2_thread   = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	   thread_id = 0;
+	   int* indirect_thread    = indirect;
+	   int* indirect2_thread   = indirect2;
+#endif
+	   /* Each thread is assigned one loop index ij, responsible for
+	      block update L(lb,k) * U(k,j) -> tempv[]. */
+	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	       /* jj0 starts after look-ahead window. */
+            int j   = ij/lookAheadBlk + jj0;
+            int lb  = ij%lookAheadBlk;
+
+            /* Getting U block U(k,j) information */
+            /* unsigned long long ut_start, ut_end; */
+            int_t rukp =  Ublock_info[j].rukp;
+            int_t iukp =  Ublock_info[j].iukp;
+            int jb   =  Ublock_info[j].jb;
+            int nsupc = SuperSize(jb);
+            int ljb = LBj (jb, grid);  /* destination column block */
+            int st_col;
+            int ncols;  /* Local variable counts only columns in the block */
+            if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
+                ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+                st_col = Ublock_info[j-1].full_u_cols;
+            } else {
+                ncols  = Ublock_info[j].full_u_cols;
+                st_col = 0;
+            }
+
+            /* Getting L block L(i,k) information */
+            int_t lptr = lookAhead_lptr[lb];
+            int ib   = lookAhead_ib[lb];
+            int temp_nbrow = lsub[lptr+1];
+            lptr += LB_DESCRIPTOR;
+            int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+
+	    /* Block-by-block GEMM in look-ahead window */
+#if 0
+	    i = sizeof(float);
+	    float* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+	    float* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
+#if ( PRNTlevel>= 1)
+	    if (thread_id == 0) tt_start = SuperLU_timer_();
+	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
+	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+#endif
+
+#if defined (USE_VENDOR_BLAS)
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+
+#if (PRNTlevel>=1 )
+	    if (thread_id == 0) {
+		tt_end = SuperLU_timer_();
+		LookAheadGEMMTimer += tt_end - tt_start;
+		tt_start = tt_end;
+	    }
+#endif
+            if ( ib < jb ) {
+                sscatter_u (
+				 ib, jb,
+				 nsupc, iukp, xsup,
+				 klst, temp_nbrow,
+				 lptr, temp_nbrow, lsub,
+				 usub, tempv1,
+				 Ufstnz_br_ptr, Unzval_br_ptr,
+				 grid
+			        );
+            } else {
+#if 0
+		//#ifdef USE_VTUNE
+	    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	    __itt_resume(); // start VTune, again use 2 underscores
+#endif
+                sscatter_l (
+				 ib, ljb,
+				 nsupc, iukp, xsup,
+ 				 klst, temp_nbrow,
+				 lptr, temp_nbrow,
+				 usub, lsub, tempv1,
+				 indirect_thread, indirect2_thread,
+				 Lrowind_bc_ptr, Lnzval_bc_ptr,
+				 grid
+				);
+#if 0
+		//#ifdef USE_VTUNE
+		__itt_pause(); // stop VTune
+		__SSC_MARK(0x222); // stop SDE tracing
+#endif
+            }
+
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0)
+		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+	   } /* end omp for ij = ... */
+
+#ifdef _OPENMP
+	 } /* end omp parallel */
+#endif
+     } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
+    /***************************************************************
+     * Updating remaining rows and columns on CPU.
+     ***************************************************************/
+    ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+	double flps = 2.0 * (double)Rnbrow * ldu * ncols;
+	schur_flop_counter  += flps;
+	stat->ops[FACT]     += flps;
+
+#if ( PRNTlevel>=1 )
+	RemainGEMM_flops += flps;
+	gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+	gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+	tt_start = SuperLU_timer_();
+	/* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+	   iam, k0,Rnbrow,ldu,ncols,RemainBlk);  fflush(stdout);
+	assert( Rnbrow*ncols < bigv_size ); */
+#endif
+	/* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
+
+#if ( PRNTlevel>=1 )
+	tt_end = SuperLU_timer_();
+	RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+	//fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+	// (tt_end - tt_start)*1e6); // time in microsecond
+	//fflush(fgemm);
+	gemm_stats[gemm_count].m = Rnbrow;
+	gemm_stats[gemm_count].n = ncols;
+	gemm_stats[gemm_count].k = ldu;
+	gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+	tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+	__SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	__itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+	/* Scatter into destination block-by-block. */
+#ifdef _OPENMP
+#pragma omp parallel default(shared) private(thread_id)
+	{
+#ifdef _OPENMP	
+	    thread_id = omp_get_thread_num();
+#else	    
+	    thread_id = 0;
+#endif
+
+	    /* Ideally, should organize the loop as:
+               for (j = 0; j < jj_cpu; ++j) {
+	           for (lb = 0; lb < RemainBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+                   }
+               }
+	       But now, we use collapsed loop to achieve more parallelism.
+	       Total number of block updates is:
+	       (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	    */
+
+	    int i = sizeof(int);
+	    int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+	    int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	    thread_id = 0;
+	    int* indirect_thread = indirect;
+	    int* indirect2_thread = indirect2;
+#endif
+	    /* Each thread is assigned one loop index ij, responsible for
+	       block update L(lb,k) * U(k,j) -> tempv[]. */
+	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+		/* jj_cpu := nub, jj0 starts after look-ahead window. */
+		int j   = ij / RemainBlk + jj0; /* j-th block in U panel */
+		int lb  = ij % RemainBlk;       /* lb-th block in L panel */
+
+		/* Getting U block U(k,j) information */
+		/* unsigned long long ut_start, ut_end; */
+		int_t rukp =  Ublock_info[j].rukp;
+		int_t iukp =  Ublock_info[j].iukp;
+		int jb   =  Ublock_info[j].jb;
+		int nsupc = SuperSize(jb);
+		int ljb = LBj (jb, grid);
+		int st_col;
+		int ncols;
+		if ( j>jj0 ) {
+		    ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+		    st_col = Ublock_info[j-1].full_u_cols;
+		} else {
+		    ncols = Ublock_info[j].full_u_cols;
+		    st_col = 0;
+		}
+
+		/* Getting L block L(i,k) information */
+		int_t lptr = Remain_info[lb].lptr;
+		int ib   = Remain_info[lb].ib;
+		int temp_nbrow = lsub[lptr+1];
+		lptr += LB_DESCRIPTOR;
+		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
+		float* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+		/* Now scattering the block */
+
+		if ( ib < jb ) {
+		    sscatter_u (
+				ib, jb,
+				nsupc, iukp, xsup,
+				//klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+				klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+				lptr, temp_nbrow, /* row dimension of the block */
+				lsub, usub, tempv1,
+				Ufstnz_br_ptr, Unzval_br_ptr,
+				grid
+				);
+		} else {
+		    sscatter_l(
+			       ib, ljb,
+			       nsupc, iukp, xsup,
+			       //klst, temp_nbrow, Sherry
+			       klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+			       lptr, temp_nbrow, /* row dimension of the block */
+			       usub, lsub, tempv1,
+			       indirect_thread, indirect2_thread,
+			       Lrowind_bc_ptr,Lnzval_bc_ptr,
+			       grid
+			       );
+		}
+
+	    } /* end omp for (int ij =...) */
+
+#ifdef _OPENMP
+	} /* end omp parallel region */
+#endif
+
+#if ( PRNTlevel>=1 )
+	RemainScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+
+#ifdef USE_VTUNE
+	__itt_pause(); // stop VTune
+	__SSC_MARK(0x222); // stop SDE tracing
+#endif
+
+    } /* end if Rnbrow>0 ... update remaining block */
+
+}  /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/sSchCompUdt-cuda.c b/SRC/sSchCompUdt-cuda.c
new file mode 100644
index 00000000..579cc6ea
--- /dev/null
+++ b/SRC/sSchCompUdt-cuda.c
@@ -0,0 +1,589 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief This file contains the main loop of psgstrf which involves
+ *        rank k update of the Schur complement.
+ *        Uses CUDA GPU.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY dynamic
+
+/* Check the status returned by a gpuBLAS call; on failure, print the
+ * numeric error code with the source location and abort the program.
+ * The body is wrapped in do { } while(0) *without* a trailing semicolon,
+ * so the macro expands to exactly one statement and the call site's own
+ * ';' completes it.  (The original trailing ';' produced a double
+ * statement and would break a braceless if/else around the macro.)
+ */
+#define gpublasCheckErrors(fn) \
+    do { \
+        gpublasStatus_t __err = fn; \
+        if (__err != GPUBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Fatal gpublas error: %d (at %s:%d)\n", \
+                (int)(__err), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while(0)
+
+int full;
+double gemm_timer = 0.0;
+double scatter_timer = 0.0;
+
+if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
+    ldu   =0;
+    full  =1;
+    int cum_nrow;
+    int temp_nbrow;
+
+    lptr = lptr0;
+    luptr = luptr0;
+
+    nbrow= lsub[1];
+    if (myrow==krow) nbrow = lsub[1]-lsub[3];
+
+    if (nbrow>0) {
+
+        // Maximum number of columns that can fit in dC[buffer_size] on GPU 
+#if 0   // max_ldu can be < ldt, so bigu_size/ldt may be smaller, giving false alarm
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
+#else // Sherry fix
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow, max_ncols);
+#endif
+	
+        int num_streams_used, /* number of streams that will be used*/
+        ncpu_blks;            /* the leading number of CPU dgemm blks
+			         in each partition */
+        int jjj, jjj_st,jjj_global;
+        for (j = jj0; j < nub; ++j) {
+            arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
+
+            ncols =0 ;  //initialize at 0
+            jj = iukp;
+            int temp_ldu=0;
+            for (; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+		    ++ncols;
+		}
+                temp_ldu = SUPERLU_MAX(temp_ldu, segsize);
+            }
+
+            full_u_cols[j] = ncols;
+            blk_ldu[j] = temp_ldu;
+        } /* end for j = jj0..nub */
+
+        jjj = jj0; /* jj0 is the first block column after look-ahead window */
+
+        // #pragma omp barrier
+        while ( jjj < nub ) {
+            jjj_st=jjj;
+#ifdef _OPENMP
+#pragma omp single
+#endif
+            {
+                ldu = blk_ldu[jjj_st];
+                for (j = jjj_st; j < nub ; ++j) {
+
+                    /* prefix sum */
+                    if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
+
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);
+
+                    /* break condition */
+                    /* the number of columns that can be processed on GPU is
+		       limited by buffer size */
+                    if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
+                        break; // block column j+1 does not fit in GPU memory */
+                    }
+                } /* end for j=jjj_st to nub */
+
+                jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj < nub */
+
+                // TAU_STATIC_TIMER_START("work_divison");
+                /* Divide CPU-GPU gemm here.
+		 * If there is only one block, we leave it on CPU.
+		 */
+                gemm_division_cpu_gpu(
+		       &num_streams_used,/*number of streams that will be used*/
+		       stream_end_col,   /*array holding last column blk for each partition*/
+		       &ncpu_blks,       /*number of CPU gemm blks*/
+		       		// Following are inputs
+		       nbrow,            /*number of rows in A matrix*/
+		       ldu,              /*value of k in dgemm*/
+		       nstreams,
+		       full_u_cols + jjj_st, /*array containing prefix sum of GPU workload*/
+		       jjj_global - jjj_st /*number of block columns on GPU.
+		       		             If only one block, leave it on CPU*/
+                );
+                // TAU_STATIC_TIMER_STOP("work_divison");
+
+            } /* pragma omp single */
+
+            jjj = jjj_global; /* Move to the next [ CPU : GPU ] partition */
+	    
+#if 0 // !!Sherry: this test is not necessary
+	    // if jjj_global - jjj_st == 1, everything is on CPU.
+	    // bigv_size is calculated sufficiently large.
+            if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
+                printf("allocate more memory for buffer !!!!\n"
+		       ".. jjj_st %d, nbrow %d, full_u_cols[jjj_st] %d, ncol_max %d\n",
+		       jjj_st, nbrow, full_u_cols[jjj_st], ncol_max);
+                if(nbrow * full_u_cols[jjj_st] > buffer_size)
+                    printf("[%d] needed %d > buffer_size %d\n",iam,nbrow*full_u_cols[jjj_st],buffer_size );
+		fflush(stdout);
+            }
+#endif
+
+            // #pragma omp barrier
+            /* gathering circuit */
+            assert(jjj_st<jjj);
+            assert(nbrow!=0);
+            /* NOTE(review): the '<...>' span of this line was stripped by
+               extraction; reconstructed from context -- confirm against
+               dSchCompUdt-cuda.c */
+            if ( nbrow > 0 ) {
+#ifdef PI_DEBUG
+		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size ); fflush(stdout);
+		assert(nbrow*ldu<=ldt*max_row_size);
+#endif
+		gpuMemcpy2DAsync(dA, nbrow*sizeof(float),
+				  &lusup[luptr+(knsupc-ldu)*nsupr],
+				  nsupr*sizeof(float), nbrow*sizeof(float),
+				  ldu, gpuMemcpyHostToDevice, streams[0]);
+	    }
+
+	    for (int i = 0; i < num_streams_used; ++i) { // streams on GPU
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1];
+		// st starts after the leading ncpu_blks
+		int st_col = full_u_cols[st-1];
+		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
+		tempu = bigU;
+
+		float *tempv1 = bigV + full_u_cols[st-1]*nbrow;
+
+		/* Following is for testing purpose */
+		if ( num_col_stream > 0 ) {		
+#ifdef GPU_ACC
+		    int stream_id = i;
+		    int b_offset  = ldu * st_col;
+		    int c_offset  = st_col * nbrow;
+		    size_t B_stream_size = ldu * num_col_stream * sizeof(float);
+		    size_t C_stream_size = nbrow * num_col_stream * sizeof(float);
+
+		    assert(nbrow*(st_col+num_col_stream) < buffer_size);
+
+		    gpuMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
+		    		    gpuMemcpyHostToDevice, streams[stream_id]);
+
+		    gpublasCheckErrors(
+				  gpublasSetStream(handle[stream_id],
+						  streams[stream_id])
+				     );
+
+		    gpublasCheckErrors(
+				  gpublasSgemm(handle[stream_id],
+					      GPUBLAS_OP_N, GPUBLAS_OP_N,
+					      nbrow, num_col_stream, ldu,
+                                              &alpha, dA, nbrow,
+					      &dB[b_offset], ldu,
+					      &beta, &dC[c_offset],
+                                              nbrow)
+				  );
+
+		    checkGPU( gpuMemcpyAsync(tempv1, dC+c_offset,
+					   C_stream_size,
+					   gpuMemcpyDeviceToHost,
+					   streams[stream_id]) );
+#else /*-- on CPU --*/
+
+	            my_sgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
+			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
+			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
+			      tempv1, &nbrow, 1, 1);
+#endif
+   	        } // end if num_col_stream > 0
+
+	    } /* end for i = 1 to num_streams used */
+
+	    /* Special case for CPU -- leading block columns are computed 
+	       on CPU in order to mask the GPU data transfer latency */
+	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
+	    int st_col = 0; /* leading part on CPU */
+	    tempv = bigV + nbrow * st_col;
+	    tempu = bigU;
+
+	    double tstart = SuperLU_timer_();
+#if defined (USE_VENDOR_BLAS)
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
+#else
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow);
+#endif
+	    gemm_timer += SuperLU_timer_() -tstart;
+	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
+
+            /* Now scattering blocks computed by CPU */
+            int temp_ncol;
+
+            /* scatter leading blocks which CPU has computated */
+            tstart = SuperLU_timer_();
+
+#ifdef _OPENMP
+#pragma omp parallel  \
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,	\
+	    segsize,lead_zero,					\
+	    ib, temp_nbrow,ilst,lib,index,			\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,			\
+	    nzval,     lb ,                     jj, i)		\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+		int num_threads = omp_get_num_threads();
+#else
+                int thread_id = 0;
+		int num_threads = 1;
+#endif		
+
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+
+                if ( ncpu_blks < num_threads ) {
+                    // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
+
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_st, jjj_st+ncpu_blks\n",j,jjj_st,jjj_st+ncpu_blks);
+#endif
+
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+#endif
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            int cum_nrow = 0;
+                            int temp_nbrow;
+                            lptr = lptr0;
+                            luptr = luptr0;
+                            for (int i = 0; i < lb; ++i) {
+                                ib = lsub[lptr];        /* Row block L(i,k). */
+                                temp_nbrow = lsub[lptr+1];   /* Number of full rows. */
+                                lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+                                lptr += temp_nbrow;
+                                luptr += temp_nbrow;
+                                cum_nrow +=temp_nbrow;
+                            }
+
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) {  /* A(i,j) is in U. */
+                                #ifdef PI_DEBUG
+                                    printf("cpu scatter \n");
+                                    printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+                                #endif
+
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                    // TAU_STATIC_TIMER_STOP("SPECIAL_CPU_SCATTER");
+                } else { // ncpu_blks >= omp_get_num_threads()
+#ifdef _OPENMP
+#pragma omp for schedule(SCHEDULE_STRATEGY) nowait
+#endif
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d\n",j);
+#endif
+
+                        /* == processing each of the remaining columns == */
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+			    /* Now gather the result into the destination block. */
+			    if ( ib < jb ) {  /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("cpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+
+				tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+			    } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+			    } /* if ib < jb ... */
+
+			    lptr += temp_nbrow;
+			    luptr += temp_nbrow;
+			    cum_nrow += temp_nbrow;
+
+			} /* for lb ... */
+
+			luptr=luptr0;
+		    } /* for j = jjj_st ... */
+		}     /* else (ncpu_blks >= omp_get_num_threads()) */
+	    }         /* parallel region */
+
+	    scatter_timer += SuperLU_timer_() - tstart;
+	    
+	    // Scatter tempv(:, (jjj_st1 : jjj_global)) computed on GPU.
+#ifdef _OPENMP
+#pragma omp parallel							\
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
+	    segsize,lead_zero,						\
+	    ib, temp_nbrow,ilst,lib,index,				\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,				\
+	    nzval,     lb ,                     jj, i)			\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+                for(i = 0; i < num_streams_used; i++) { /* i is private variable */
+                    checkGPU(gpuStreamSynchronize (streams[i]));
+		    // jjj_st1 := first block column on GPU stream[i]
+		    int jjj_st1 = (i==0) ? jjj_st + ncpu_blks : jjj_st + stream_end_col[i-1];
+                    int jjj_end = jjj_st + stream_end_col[i];
+                    assert(jjj_end-1<nub) ;
+                    assert(jjj_st1>jjj_st) ;
+
+                    /* now scatter it */
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+                    for (j = jjj_st1; j < jjj_end; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_end %d, nub %d\n",j,jjj_end,nub); fflush(stdout);
+#endif
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th
+			   block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+                        for (lb = 0; lb < nlb; lb++) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+                            /* Now scatter result into destination block. */
+                            if ( ib < jb ) { /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("gpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("gpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                } /* end for i = 0 to nstreams */
+		
+                // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
+                // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
+		
+            } /* end pragma omp parallel */
+            // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
+	    
+        }  /* end while(jjj<nub) */
+    } /* end if nbrow>0 */
+
+ }   /* if msg1 and msg 2 */
+
+
+
diff --git a/SRC/sbinary_io.c b/SRC/sbinary_io.c
new file mode 100644
index 00000000..b1a3c81a
--- /dev/null
+++ b/SRC/sbinary_io.c
@@ -0,0 +1,42 @@
+#include "superlu_sdefs.h"
+
+int
+sread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, 
+	     float **nzval, int_t **rowind, int_t **colptr)
+{ /* Read a square sparse matrix in binary CSC form (n, nnz, colptr, rowind, nzval); allocates the three arrays and closes fp. Returns 0. */
+    size_t isize = sizeof(int_t), dsize = sizeof(float);
+    int nnz_read;
+    fread(n, isize, 1, fp); /* NOTE(review): fread return values unchecked here and below */
+    fread(nnz, isize, 1, fp);
+    printf("fread n " IFMT "\tnnz " IFMT "\n", *n, *nnz);
+    *m = *n; /* matrix is assumed square: row count mirrors column count */
+    *colptr = intMalloc_dist(*n+1);
+    *rowind = intMalloc_dist(*nnz);
+    *nzval  = floatMalloc_dist(*nnz);
+    fread(*colptr, isize, (size_t) (*n + 1), fp);
+    fread(*rowind, isize, (size_t) *nnz, fp);
+    nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp);
+    printf("# of floats fread: %d\n", nnz_read);
+    fclose(fp); /* NOTE(review): closes the caller-supplied stream as a side effect */
+    return 0;
+}
+
+int
+swrite_binary(int_t n, int_t nnz,
+	      float *values, int_t *rowind, int_t *colptr)
+{       /* Dump an n x n CSC matrix to "matrix.bin" in the layout sread_binary expects. Returns 0. */
+      FILE  *fp1;
+      int nnz_written;
+      size_t isize = sizeof(int_t), dsize = sizeof(float);
+      fp1 = fopen("matrix.bin", "wb"); /* NOTE(review): fopen result not checked for NULL */
+      fwrite(&n, isize, 1, fp1);
+      fwrite(&nnz, isize, 1, fp1);
+      fwrite(colptr, isize, n+1, fp1);
+      fwrite(rowind, isize, nnz, fp1);
+      nnz_written = fwrite(values, dsize, nnz, fp1);
+      printf("n " IFMT ", # of float: " IFMT "\n", n, nnz);
+      printf("dump binary file ... # of float fwrite: %d\n", nnz_written);
+      assert(nnz_written==nnz); /* NOTE(review): compiled out under NDEBUG, leaving the short-write unhandled */
+      fclose(fp1);
+      return 0;
+}
diff --git a/SRC/scan.cu b/SRC/scan.cu
new file mode 100644
index 00000000..5a23509c
--- /dev/null
+++ b/SRC/scan.cu
@@ -0,0 +1,194 @@
+#include <cstdio>          /* NOTE(review): header names stripped in transit — reconstructed; verify against original */
+#include <cuda_runtime.h>
+
+// typedef float pfx_dtype ; 
+
+int nextpow2(int v)
+
+{ /* Host helper: round v up to the next power of two via the bit-smearing trick. */
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16; /* covers 32-bit int; a power-of-two input returns itself */
+    v++;
+
+    return v;
+}
+
+__device__ int dnextpow2(int v)
+
+{ /* Device-side copy of nextpow2: round v up to the next power of two. */
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+
+    return v;
+}
+
+
+
+typedef int pfx_dtype ; 
+__global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    if(2*thread_id  < n_original)
+        temp[2*thread_id] = inArr[2*thread_id]; 
+    else 
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original) /* NOTE(review): span garbled in transit (text between '<'..'>' stripped); reconstructed per standard Blelloch scan */
+        temp[2*thread_id+1] = inArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] = 0;
+
+    for (int d = n>>1; d > 0; d >>= 1) 
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+    
+    if (thread_id == 0) { temp[n - 1] = 0; } 
+    for (int d = 1; d < n; d *= 2) 
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    if(2*thread_id  < n_original)
+    outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1  < n_original)
+    outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
+    __syncthreads();
+    if(2*thread_id  < n_original)
+    printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    if(2*thread_id+1  < n_original)
+    printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
+    __syncthreads();
+} 
+
+#define SELF_TEST 
+#ifdef SELF_TEST
+
+#include <iostream>  /* NOTE(review): header name stripped in transit; std::cout is used below — verify against original */
+#include "cub/cub.cuh"
+
+#define THREAD_BLOCK_SIZE 8
+
+
+// __global__
+// void cub_scan_test(int N)
+// {
+// 	int thread_id = threadIdx.x;
+// 	typedef cub::BlockScan BlockScan; /*1D int data type*/
+
+// 	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+// 	extern __shared__ int* IndirectJ1;
+// 	extern __shared__ int* IndirectJ2= IndirectJ1+ N*sizeof(int);
+
+// 	if (thread_id < N)
+// 	{
+// 		IndirectJ1[thread_id] = 2*thread_id +1;
+// 	}
+
+// 	__syncthreads();
+// 	if (thread_id < THREAD_BLOCK_SIZE)
+// 		BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
+
+
+// 	if (thread_id < THREAD_BLOCK_SIZE)
+// 		printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
+
+// }
+
+
+
+// extern __shared__
+// #define THREAD_BLOCK_SIZE 7
+
+__global__ void initData(pfx_dtype* A, int n)
+{
+    int threadId = threadIdx.x;   
+    if(threadId < n)   /* NOTE(review): this span was garbled in transit (initData body + main() prologue lost); reconstructed from surrounding uses of A, xA, N, N2 — verify against original */
+        A[threadId] = threadId + 1;
+}
+
+int main()
+{
+    const int N = THREAD_BLOCK_SIZE;
+    const int N2 = nextpow2(N) / 2;
+    pfx_dtype *A, *xA;
+    cudaMallocManaged(&A, N * sizeof(pfx_dtype));
+    cudaMallocManaged(&xA, N * sizeof(pfx_dtype));
+    initData<<< 1, N >>> (A,N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<"Error- 0\n";
+    // prescan<<<  1,THREAD_BLOCK_SIZE/2,2*THREAD_BLOCK_SIZE*sizeof(pfx_dtype) >>> (xA, A, N);
+    
+    prescan<<<  1,(N+1)/2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    prescan<<<  1,N2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<".....EXITING\n";   
+    else
+        std::cout<<"No errors reported\n";
+
+
+    // typedef cub::BlockScan BlockScan; /*1D int data type*/
+	// __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+    // cub_scan_test <<<  1,THREAD_BLOCK_SIZE >>> (N);
+
+    return 0;
+}
+
+#endif 
\ No newline at end of file
diff --git a/SRC/scatter.c b/SRC/scatter.c
new file mode 100644
index 00000000..46926955
--- /dev/null
+++ b/SRC/scatter.c
@@ -0,0 +1,589 @@
+#include "superlu_ddefs.h"
+#include "scatter.h"
+//#include "compiler.h"
+
+#define ISORT
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitilize the pointers to the begining of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+block_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{ /* GEMM of one L-block-row (lb) against one U-block-column (j), then scatter the product into the destination L or U block. */
+    // return ;
+    thread_id = omp_get_thread_num(); /* NOTE(review): caller-supplied thread_id is discarded and overwritten here */
+    int_t *indirect_thread = indirect + ldt * thread_id;  /* per-thread slice of scratch index arrays */
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;        /* per-thread GEMM output buffer */
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int_t ncols;
+    // if (j > jj0)
+    if (j > 0)
+    { /* ncols / st_col derived from the running prefix sums in full_u_cols */
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    /* Getting L block information */
+    // int_t lptr = Remain_info[lb].lptr;
+    // int_t ib   = Remain_info[lb].ib;
+    // int_t temp_nbrow = lsub[lptr + 1];
+    // lptr += LB_DESCRIPTOR;
+    // int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+
+    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, /* tempv1 (temp_nbrow x ncols) = L panel * U panel */
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    { /* destination block lies in U (above the diagonal block row) */
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    { /* destination block lies in L */
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter */
+
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+block_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{ /* Same GEMM+scatter as block_gemm_scatter, but the scatter phase is serialized via an optional per-block OpenMP lock. Uses precomputed StCol/ncols/StRow instead of prefix-sum differences. */
+    int_t *indirect_thread = indirect + ldt * thread_id;  /* per-thread scratch (thread_id trusted from caller here) */
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock)) /* busy-wait (spin) until the destination block's lock is acquired */
+        {
+
+        }
+
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    { /* destination block lies in U */
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    { /* destination block lies in L */
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock); /* release; matching acquire is the omp_test_lock spin above */
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter_lock */
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t block_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2, HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{ /* Top-left quadrant wrapper: look-ahead L buffer x host U buffer; unpacks LUstruct and times the update. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' here vs '__rdtsc' in SCATTER_PROFILE code — confirm which intrinsic/wrapper is intended */
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* per-thread timer, padded by CACHE_LINE_SIZE to avoid false sharing */
+    return 0;
+} /* block_gemm_scatterTopLeft */
+
+int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{ /* Top-right quadrant wrapper: look-ahead L buffer x Phi-side U buffer (bigU_Phi/ldu_Phi). Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+   volatile  int_t thread_id = omp_get_thread_num();
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+} /* block_gemm_scatterTopRight */
+
+
+int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{ /* Bottom-left quadrant wrapper: remaining L buffer (Remain_L_buff/Rnbrow) x host U buffer. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    //printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+
+} /* block_gemm_scatterBottomLeft */
+
+int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{ /* Bottom-right quadrant wrapper: remaining L buffer x Phi-side U buffer. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+   volatile  int_t thread_id = omp_get_thread_num();
+   // printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+
+} /* block_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: The following routines may conflict with dscatter.c
+ ******************************************************************/
+#if 1
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{ /* Subtract the temp_nbrow x nsupc GEMM result tempv (leading dim nbrow) from destination L block (ib) in local column ljb. */
+
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)           /* linear search for dest block ib within column ljb */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz; /* row offset relative to first row of supernode ib */
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {   /* indirect2[i]: dest row position for source row i */
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj]; /* nonzero segment length of U column jj; 0 means skip */
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i]; /* accumulate the Schur-complement update */
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+}
+
+void   // SHERRY: NOT CALLED!!
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{ /* Subtract the temp_nbrow x nsupc GEMM result tempv (leading dim nbrow) from destination U block (ib,jb) in local block row. */
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {   /* stash the global row indices of the source rows */
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj]; /* nonzero segment length of U column jj; 0 means skip */
+        fnz = index[iuip_lib++];
+        ucol -= fnz;            /* offset so ucol[] can be indexed by the row indices in indirect[] */
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i]; /* accumulate the Schur-complement update */
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;          /* advance past this column's storage (undoes the -fnz shift) */
+
+    } /*for jj=0:nsupc */
+
+}
+
+#endif // comment out
+
diff --git a/SRC/scatter.h b/SRC/scatter.h
new file mode 100644
index 00000000..568a1687
--- /dev/null
+++ b/SRC/scatter.h
@@ -0,0 +1,147 @@
+#ifndef _SCATTER_H_
+#define _SCATTER_H_
+
+#ifdef CLEAN_SCATTER
+#define SCATTER_L_CPU  scatter_l
+#define SCATTER_U_CPU  scatter_u
+#else
+#define SCATTER_L_CPU  scatter_l  /* NOTE(review): both branches are identical; CLEAN_SCATTER currently has no effect */
+#define SCATTER_U_CPU  scatter_u
+
+#endif
+
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid);
+
+void
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid);
+
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid);
+
+
+void
+block_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  );
+
+
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+block_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       );
+
+int_t block_gemm_scatterTopLeft( int_t lb,  int_t j,
+                                 double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                 int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                 HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               );
+int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                );
+int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  );
+int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   );
+
+void gather_u(int_t num_u_blks,
+              Ublock_info_t *Ublock_info, int_t * usub,
+              double *uval,  double *bigU,  int_t ldu,
+              int_t *xsup, int_t klst                /* for SuperSize */
+             );
+
+void gather_l( int_t num_LBlk, int_t knsupc,
+               Remain_info_t *L_info,
+               double * lval, int_t LD_lval,
+               double * L_buff );
+#endif
diff --git a/SRC/scommunication_aux.c b/SRC/scommunication_aux.c
new file mode 100644
index 00000000..55e83851
--- /dev/null
+++ b/SRC/scommunication_aux.c
@@ -0,0 +1,504 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t sIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row.
+ Non-blocking: one MPI_Isend pair per destination column; the requests
+ are stored in send_req[pj] (index) and send_req[pj + Pc] (values).
+ NOTE(review): tag_ub is not referenced directly -- presumably consumed
+ by the SLU_MPI_TAG macro; confirm in superlu_defs.h.*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        /* msgcnt[0]: length of the index message = header + per-block
+           descriptors + row subscripts (lsub[0] = #blocks, lsub[1] =
+           #rows).  msgcnt[1]: #numerical values = rows x supernode width. */
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        /* Empty panel: zero-length messages are still posted below. */
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t sBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row.
+ Blocking variant of sIBcast_LPanel: uses MPI_Send and accumulates the
+ elapsed wall time into SCT.*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        /* msgcnt[0]: index-array length; msgcnt[1]: #numerical values. */
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    /* NOTE(review): this L-panel broadcast accumulates into
+       Bcast_UPanel_tl (as did the commented-out rdtsc line) -- confirm
+       whether a dedicated L-panel counter was intended in SCT_t. */
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t sIBcast_UPanel
+/*asynchronously broadcasts the U panel down my process column (the code
+ uses the column scope cscp and loops over process rows).  Non-blocking:
+ requests are stored in send_req_u[pi] (index array) and
+ send_req_u[pi + Pr] (values).*/
+(int_t k, int_t k0, int_t* usub, float* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        /* usub[2]: total length of the index array; usub[1]: total
+           number of nonzero values (see the U-structure setup, where
+           index[1] = nzval length and index[2] = index length). */
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_FLOAT,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts the U panel down my process column (column
+  scope cscp, looping over process rows) using blocking MPI_Send; the
+  elapsed wall time is added to SCT->Bcast_UPanel_tl. */
+int_t sBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     float* uval, gridinfo_t *grid,
+		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        /* usub[2]: index-array length; usub[1]: #nonzero values. */
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+/* Post non-blocking receives for an L panel: the index array into
+   Lsub_buf and the numerical values into Lval_buf, both from the process
+   column that owns supernode k.  Completion requests are left in
+   recv_req[0] (index, tag 0) and recv_req[1] (values, tag 1), matching
+   the sends in sIBcast_LPanel/sBcast_LPanel. */
+int_t sIrecv_LPanel
+(int_t k, int_t k0,  int_t* Lsub_buf, float* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, sLocalLU_t *Llu, int tag_ub )
+{
+    superlu_scope_t *rowscope = &grid->rscp; /* process-row communicator */
+    int_t src = PCOL (k, grid);              /* owner column of block k  */
+
+    /* Receive up to the preallocated buffer maxima; the actual lengths
+       are recovered later via MPI_Get_count. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, src,
+               SLU_MPI_TAG (0, k0), rowscope->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_FLOAT, src,
+               SLU_MPI_TAG (1, k0), rowscope->comm, &recv_req[1]);
+    return 0;
+}
+
+
+/* Post non-blocking receives for a U panel: the index array into
+   Usub_buf (tag 2) and the numerical values into Uval_buf (tag 3), both
+   from the process row that owns supernode k.  Completion requests are
+   left in recv_req_u[0] and recv_req_u[1]. */
+int_t sIrecv_UPanel
+(int_t k, int_t k0, int_t* Usub_buf, float* Uval_buf, sLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    superlu_scope_t *colscope = &grid->cscp; /* process-column communicator */
+    int_t src = PROW (k, grid);              /* owner row of block k       */
+
+    /* Receive up to the preallocated buffer maxima; actual lengths are
+       recovered later via MPI_Get_count (see sWait_URecv). */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, src,
+               SLU_MPI_TAG (2, k0), colscope->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, src,
+               SLU_MPI_TAG (3, k0), colscope->comm, &recv_req_u[1]);
+    return 0;
+}
+
+int_t sWait_URecv
+/*waits for the posted U-panel receives to complete (index in
+ recv_req[0], values in recv_req[1]) and records the actual message
+ lengths in msgcnt[2] (index count) and msgcnt[3] (value count);
+ elapsed wait time is added to SCT->Wait_URecv_tl*/
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t sWait_LRecv
+/*waits till L blocks have been received.  On return msgcnt[0]/msgcnt[1]
+ hold the index/value message lengths: taken via MPI_Get_count when a
+ receive was pending, otherwise copied from the previously recorded
+ counts in msgcntsU.  Completed requests are reset to MPI_REQUEST_NULL.*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    /* Index-array receive (posted in recv_req[0]). */
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        /* Nothing pending: reuse the count recorded earlier. */
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    /* Numerical-values receive (posted in recv_req[1]). */
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+/* Non-blocking point-to-point broadcast of the factored U diagonal
+   block down my process column: one MPI_Isend (tag 4) per other process
+   row, with the request stored in U_diag_blk_send_req[pr]. */
+int_t sISend_UDiagBlock(int_t k0, float *ublk_ptr,
+                       int_t size,
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    MPI_Comm col_comm = (grid->cscp).comm;
+    int_t myrow = MYROW (grid->iam, grid);
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < grid->nprow; ++dest)
+    {
+        if (dest == myrow) continue;   /* no self-send */
+        MPI_Isend (ublk_ptr, size, MPI_FLOAT, dest,
+                   SLU_MPI_TAG (4, k0),
+                   col_comm, U_diag_blk_send_req + dest);
+    }
+    return 0;
+}
+
+
+/* Blocking receive of the U diagonal block of supernode k0 from rank
+   `src` over the process-column communicator (tag 4); the elapsed wall
+   time is added to SCT->Recv_UDiagBlock_tl. */
+int_t sRecv_UDiagBlock(int_t k0, float *ublk_ptr,
+                      int_t size,
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double start = SuperLU_timer_();
+    MPI_Status status;
+
+    MPI_Recv (ublk_ptr, size, MPI_FLOAT, src,
+              SLU_MPI_TAG (4, k0), (grid->cscp).comm, &status);
+
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - start;
+    return 0;
+}
+
+
+int_t sPackLBlock(int_t k, float* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, sLocalLU_t *Llu)
+/*Copies the leading nsupc x nsupc (diagonal) part of the local L block
+ of supernode k, stored with leading dimension nsupr, into the dense
+ column-major buffer Dest with leading dimension nsupc.*/
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    float *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    /* NOTE(review): the disabled branch above calls double-precision
+       LAPACKE_dlacpy in this single-precision file -- if ever
+       re-enabled, confirm it should be LAPACKE_slacpy. */
+    /* Copy nsupc entries per column, stride nsupr -> stride nsupc. */
+    for (int j = 0; j < nsupc; ++j) {
+	memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(float) );
+    }
+#endif
+    
+    return 0;
+}
+
+/* Non-blocking point-to-point broadcast of the factored L diagonal
+   block along my process row: one MPI_Isend (tag 5) per other process
+   column, with the request stored in L_diag_blk_send_req[pc]. */
+int_t sISend_LDiagBlock(int_t k0, float *lblk_ptr,
+                       int_t size,
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    MPI_Comm row_comm = (grid->rscp).comm;
+    int_t mycol = MYCOL (grid->iam, grid);
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < grid->npcol; ++dest)
+    {
+        if (dest == mycol) continue;   /* no self-send */
+        MPI_Isend (lblk_ptr, size, MPI_FLOAT, dest,
+                   SLU_MPI_TAG (5, k0),
+                   row_comm, L_diag_blk_send_req + dest);
+    }
+    return 0;
+}
+
+
+/* Post a non-blocking receive for the U diagonal block of supernode k0
+   from rank `src` over the process-column communicator (tag 4); the
+   request is returned in U_diag_blk_recv_req and the elapsed wall time
+   is accumulated into SCT->Recv_UDiagBlock_tl. */
+int_t sIRecv_UDiagBlock(int_t k0, float *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+
+    /* MPI functions return a plain int error code, not int_t. */
+    int err = MPI_Irecv (ublk_ptr, size, MPI_FLOAT, src,
+                         SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        /* Diagnostics go to stderr so they are not lost in redirected
+           program output. */
+        fprintf(stderr, "Error in IRecv_UDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+/* Post a non-blocking receive for the L diagonal block of supernode k0
+   from rank `src` over the process-row communicator (tag 5); the
+   request is returned in L_diag_blk_recv_req.
+   NOTE(review): the elapsed time is accumulated into Recv_UDiagBlock_tl
+   (same counter as the U variant) -- confirm whether an L-specific
+   counter exists in SCT_t. */
+int_t sIRecv_LDiagBlock(int_t k0, float *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+
+    /* MPI functions return a plain int error code, not int_t. */
+    int err = MPI_Irecv (L_blk_ptr, size, MPI_FLOAT, src,
+                         SLU_MPI_TAG (5, k0),
+                         comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        /* Diagnostics go to stderr so they are not lost in redirected
+           program output. */
+        fprintf(stderr, "Error in IRecv_lDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+/* Collective non-blocking broadcast (MPI_Ibcast) of the U diagonal
+   block along the process column; the root is the process row owning
+   supernode k.  The completion request is left in
+   L_diag_blk_ibcast_req; the caller must wait/test it. */
+int_t sIBcast_UDiagBlock(int_t k, float *ublk_ptr,
+                        int_t size,
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t root = PROW (k, grid);
+
+    MPI_Ibcast(ublk_ptr, size, MPI_FLOAT, root,
+               (grid->cscp).comm, L_diag_blk_ibcast_req);
+    return 0;
+}
+
+/* Collective non-blocking broadcast (MPI_Ibcast) of the L diagonal
+   block along the process row; the root is the process column owning
+   supernode k.  The completion request is left in
+   U_diag_blk_ibcast_req; the caller must wait/test it. */
+int_t sIBcast_LDiagBlock(int_t k, float *lblk_ptr,
+                        int_t size,
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t root = PCOL (k, grid);
+
+    MPI_Ibcast(lblk_ptr, size, MPI_FLOAT, root,
+               (grid->rscp).comm, U_diag_blk_ibcast_req);
+    return 0;
+}
+
+#endif 
+
+/* If a receive for the U diagonal block of supernode k has been posted
+   (IrecvPlcd_D[k] == 1) and this rank lies in k's process column but is
+   not the diagonal process, spin until that receive completes and mark
+   the L panel ready to factor (factored_L[k] = 1).
+
+   The previous implementation also read Llu->Lrowind_bc_ptr[lk] to
+   compute a local `nsupr` that was never used; that dead code has been
+   removed.  LUstruct is retained so the signature is unchanged. */
+int_t sUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t pkk = PNUM (PROW (k, grid), kcol, grid);
+
+    (void) LUstruct;   /* unused; kept for interface compatibility */
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* Factor the L panel once the diagonal block has arrived. */
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+
+            /* Busy-wait on the posted receive; Test_UDiagBlock_Recv
+               also accounts the wait time in SCT. */
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/sdistribute.c b/SRC/sdistribute.c
new file mode 100644
index 00000000..964f7ce4
--- /dev/null
+++ b/SRC/sdistribute.c
@@ -0,0 +1,1652 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * 
+ */
+#include "superlu_sdefs.h"
+
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * 
+ */ + +float +sdistribute(fact_t fact, int_t n, SuperMatrix *A, + Glu_freeable_t *Glu_freeable, + sLUstruct_t *LUstruct, gridinfo_t *grid) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NCPformat *Astore; + float *a; + int_t *asub; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row 
blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. */ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, sword; + float mem_use = 0.0; + + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + float **Linv_bc_ptr; /* size 
ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + sword = sizeof(float); +//#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdistribute()"); +#endif + + if ( fact == SamePattern_SameRowPerm ) { + /* --------------------------------------------------------------- + * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. + * --------------------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = floatCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. 
of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*sword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... */ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. 
*/ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... 
*/ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif + + } else { + /* -------------------------------------------------- + * FIRST TIME CREATING THE L AND U DATA STRUCTURE. + * -------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* No L and U data structures are available yet. + We need to set up the L and U data structures and propagate + the values of A into them. */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. 
*/ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + + + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. 
*/ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Phase 2 - setup U strut time: %.2f\t\n", t); +#endif + + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if (!(dense=floatCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) + *sp_ienv_dist(3))))) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); +#if ( PRNTlevel>=1 ) + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*sword; +#endif + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. 
*/ + if ( !(Lnzval_bc_ptr = (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. 
+ ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } + + jbrow = PROW( jb, grid ); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. 
owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. 
proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) )) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + 
index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], and + the initial values of A from SPA into Lnzval[]. */ + + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. 
*/ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. 
*/ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 
1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); +#endif + + } /* else fact != SamePattern_SameRowPerm */ + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit sdistribute()"); +#endif + + return (mem_use); +} /* SDISTRIBUTE */ + diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c new file mode 100644 index 00000000..db73cb4d --- /dev/null +++ b/SRC/sec_structs.c @@ -0,0 +1,665 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + + +//#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include /*for Qsort */ +#include +#include /*for sqrt*/ +#include +#include "compiler.h" +//#include "load-balance/supernodal_etree.h" +#include "supernodal_etree.h" +#endif + +#include /*for printfs*/ +#include /*for getline*/ + +double CPU_CLOCK_RATE; +/*for sorting structures */ +int Cmpfunc_R_info (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->nrows - ((Remain_info_t*)b)->nrows ); +} + + + +int Cmpfunc_U_info (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->ncols - ((Ublock_info_t*)b)->ncols ); +} + + +int sort_R_info( Remain_info_t* Remain_info, int n ) +{ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info); + + return 0; +} + +int sort_U_info( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info); + + return 0; +} + +int Cmpfunc_R_info_elm (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->eo - ((Remain_info_t*)b)->eo ); +} + + + +int Cmpfunc_U_info_elm (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->eo - ((Ublock_info_t*)b)->eo ); +} + + + +int sort_R_info_elm( Remain_info_t* Remain_info, int n ) +{ + /*sorts on the basis of order of elimination*/ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info_elm); + + return 0; +} + +int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info_elm); + + return 0; +} + +double *SCT_ThreadVarInit(int_t num_threads) +{ +#if 0 + double *var = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + double *var = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + var[i] = 0.0; + } + return var; +} + + +#define DEFAULT_CPU_FREQ 3000.0 // 3 GHz + +double getFreq(void) +{ + FILE *fp = 
fopen("/proc/cpuinfo", "rb"); + if(!fp) { + // the file /proc/cpuinfo doesn't exists, return 3000 Mhz as the frequency +#if ( PRNTlevel>=2 ) + printf("/proc/cpuinfo doesn't exists, using 3GHz as CPU frequency. Some timers will not be correct\n"); +#endif + return DEFAULT_CPU_FREQ; + } + + char *arg = 0; +#if 1 + size_t len = 0; + char *line = NULL; +#else + size_t len = 100; // Sherry fix + char *line = SUPERLU_MALLOC(len * sizeof(char)); +#endif + size_t read; + while ((read = getline(&line, &len, fp)) != -1) + { + // printf("%s", line); + char * pch; + pch = strtok (line, " \t:"); + if (pch != NULL && strcmp(pch, "cpu") == 0) + { + + /* code */ + pch = strtok (NULL, " \t:"); + // printf("%s\n", pch ); + if (pch != NULL && strcmp(pch, "MHz") == 0) + { + pch = strtok (NULL, " \t:"); + double freq = atof(pch); + free(arg); + fclose(fp); + return freq; + + break; + } + } + free(line); + line = NULL; + } + + //SUPERLU_FREE(line); // sherry added + return 0; +} + +/* Initialize various counters. */ +void SCT_init(SCT_t* SCT) +{ +#if 1 + CPU_CLOCK_RATE = getFreq() * 1e-3; +#else + CPU_CLOCK_RATE = 3000. 
* 1e-3; +#endif + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } +#endif + + SCT->acc_load_imbal = 0.0; + + /* Counter for couting memory operations */ + SCT->scatter_mem_op_counter = 0.0; + SCT->scatter_mem_op_timer = 0.0; +#ifdef SCATTER_PROFILE + SCT->Host_TheadScatterMOP = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + SCT->Host_TheadScatterTimer = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + memset(SCT->Host_TheadScatterMOP, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); + memset(SCT->Host_TheadScatterTimer, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); +#endif + + SCT->LookAheadRowSepTimer = 0.0; + SCT->LookAheadRowSepMOP = 0.0; + SCT->GatherTimer = 0.0; + SCT->GatherMOP = 0.0; + SCT->LookAheadGEMMTimer = 0.0; + SCT->LookAheadGEMMFlOp = 0.0; + SCT->LookAheadScatterTimer = 0.0; + SCT->LookAheadScatterMOP = 0.0; + SCT->AssemblyTimer = 0.0; + + SCT->offloadable_flops = 0.0; + SCT->offloadable_mops = 0.0; + +#if 0 + SCT->SchurCompUdtThreadTime = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + SCT->SchurCompUdtThreadTime = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + SCT->SchurCompUdtThreadTime[i] = 0.0; + } + + SCT->schur_flop_counter = 0.0; + SCT->schur_flop_timer = 0.0; + + SCT->datatransfer_timer = 0; + SCT->schurPhiCallTimer = 0; + SCT->schurPhiCallCount = 0; + SCT->datatransfer_count = 0; + SCT->PhiWaitTimer = 0; + SCT->PhiWaitTimer_2 = 0; + SCT->NetSchurUpTimer = 0; + SCT->PhiMemCpyTimer = 0; + SCT->PhiMemCpyCounter = 0; + + SCT->pdgstrs2_timer = 0.0; + SCT->trf2_flops = 0; + SCT->trf2_time = 0; + SCT->CPUOffloadTimer = 0; + SCT->pdgstrf2_timer = 0.0; + SCT->lookaheadupdatetimer = 0; + + /* diagonal block factorization; part 
of pdgstrf2*/ + // SCT->Local_Dgstrf2_tl = 0; + SCT->Local_Dgstrf2_Thread_tl = SCT_ThreadVarInit(num_threads); + /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */ + SCT->Wait_UDiagBlock_Recv_tl = 0; + /*wait for receiving L diagonal block: part of mpf*/ + SCT->Wait_LDiagBlock_Recv_tl = 0; + SCT->Recv_UDiagBlock_tl = 0; + /*wait for previous U block send to finish; part of pdgstrf2 */ + SCT->Wait_UDiagBlockSend_tl = 0; + /*after obtaining U block, time spent in calculating L panel;part of pdgstrf2*/ + SCT->L_PanelUpdate_tl = 0; + /*Synchronous Broadcasting U panel*/ + SCT->Bcast_UPanel_tl = 0; + SCT->Bcast_LPanel_tl = 0; + /*Wait for L send to finish */ + SCT->Wait_LSend_tl = 0; + + /*Wait for U send to finish */ + SCT->Wait_USend_tl = 0; + /*Wait for U receive */ + SCT->Wait_URecv_tl = 0; + /*Wait for L receive */ + SCT->Wait_LRecv_tl = 0; + + /*U_panelupdate*/ + SCT->PDGSTRS2_tl = 0; + + /*profiling by phases*/ + SCT->Phase_Factor_tl = 0; + SCT->Phase_LU_Update_tl = 0; + SCT->Phase_SC_Update_tl = 0; + + /*time to get the lock*/ + SCT->GetAijLock_Thread_tl = SCT_ThreadVarInit(num_threads); + + /*3d timers*/ + SCT->ancsReduce = 0.0; + SCT->gatherLUtimer = 0.0; + + for (int i = 0; i < MAX_3D_LEVEL; ++i) + { + /* code */ + SCT->tFactor3D[i] = 0; + SCT->tSchCompUdt3d[i] = 0; + } + + SCT->tAsyncPipeTail = 0.0; + SCT->tStartup =0.0; + + SCT->commVolFactor =0.0; + SCT->commVolRed =0.0; +} /* SCT_init */ + +void SCT_free(SCT_t* SCT) +{ +#ifdef SCATTER_PROFILE + free(SCT->Host_TheadScatterMOP); + free(SCT->Host_TheadScatterTimer); +#endif +#if 0 + _mm_free(SCT->SchurCompUdtThreadTime); + _mm_free(SCT->Local_Dgstrf2_Thread_tl); + _mm_free(SCT->GetAijLock_Thread_tl); +#else + SUPERLU_FREE(SCT->SchurCompUdtThreadTime); + SUPERLU_FREE(SCT->Local_Dgstrf2_Thread_tl); + SUPERLU_FREE(SCT->GetAijLock_Thread_tl); +#endif + SUPERLU_FREE(SCT); // sherry added +} + + +void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid) +/* +Prints average of the 
value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d) +/* +Prints average of the value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid3d->iam; + int num_procs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrintMarkupHeader(char* headerTitle, double 
value, gridinfo_t* grid) +{ + + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + + if (!iam) + { + printf("#### %s : %10.4f \n\n", headerTitle,sum / num_procs ); + printf("|Function name \t| avg \t| min \t| max \t| std-dev| units|\n"); + printf("|---|---|---|---|---|---|\n"); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} +void DistPrintThreaded(char* function_name, double* value, double Norm, int_t num_threads, char* Units, gridinfo_t* grid) +/* +Prints average of the value across all the MPI ranks, for threaded variables; +First averages over all the threads; +Norm is normalizing constant +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double local_sum = 0; + for (int i = 0; i < num_threads ; ++i) + { + local_sum += value[i * CACHE_LINE_SIZE]; + } + + local_sum = local_sum / (Norm * num_threads); + double sum; + double min = 0; + double max = 0; + double value_squared = local_sum * local_sum; + double sum_value_squared; + + MPI_Reduce( &local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &local_sum, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &local_sum, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%% %s|\n", function_name, + sum / num_procs, min, max, 100 
* num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } +} + + +/*for mkl_get_blocks_frequency*/ +// #include "mkl.h" +void SCT_print(gridinfo_t *grid, SCT_t* SCT) +{ + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } +#endif + CPU_CLOCK_RATE = 1e9 * CPU_CLOCK_RATE; + + int iam = grid->iam; + int_t num_procs = grid->npcol * grid->nprow; + double temp_holder; + MPI_Reduce( &SCT->NetSchurUpTimer, &temp_holder, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + if (!iam) + { + printf("CPU_CLOCK_RATE %.1f\n", CPU_CLOCK_RATE ); + printf("Total time in factorization \t: %5.2lf\n", SCT->pdgstrfTimer); + printf("MPI-communication phase \t: %5.2lf\n", SCT->pdgstrfTimer - (temp_holder / num_procs)); + + } + + /* Printing Panel factorization profile*/ + // double CPU_CLOCK_RATE = 1e9 * mkl_get_clocks_frequency(); + + + // DistPrint("Local_Dgstrf2", SCT->Local_Dgstrf2_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "Local_Dgstrf2 ", SCT->Local_Dgstrf2_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + + // DistPrint("Wait_UDiagBlock_Recv ", SCT->Wait_UDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_LDiagBlock_Recv ", SCT->Wait_LDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Recv_UDiagBlock ", SCT->Recv_UDiagBlock_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_UDiagBlockSend ", SCT->Wait_UDiagBlockSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + + // DistPrint("Bcast_UPanel ", SCT->Bcast_UPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Bcast_LPanel ", SCT->Bcast_LPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_LSend ", SCT->Wait_LSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_USend ", SCT->Wait_USend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_URecv ", SCT->Wait_URecv_tl / CPU_CLOCK_RATE, 
"Seconds", grid); + DistPrint("Wait_LRecv ", SCT->Wait_LRecv_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("L_PanelUpdate ", SCT->L_PanelUpdate_tl , "Seconds", grid); + DistPrint("PDGSTRS2 ", SCT->PDGSTRS2_tl , "Seconds", grid); + + DistPrint("wait-FunCallStream ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("wait-copyStream ", SCT->PhiWaitTimer_2 , "Seconds", grid); + DistPrint("waitGPU2CPU ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("SchurCompUpdate ", SCT->NetSchurUpTimer, "Seconds", grid); + DistPrint("PanelFactorization ", SCT->pdgstrfTimer - SCT->NetSchurUpTimer, "Seconds", grid); + + // DistPrint("Phase_Factor ", SCT->Phase_Factor_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_LU_Update ", SCT->Phase_LU_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_SC_Update ", SCT->Phase_SC_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "GetAijLock ", SCT->GetAijLock_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + double t_total = SCT->tStartup + SCT->pdgstrfTimer + SCT->gatherLUtimer; + DistPrintMarkupHeader("High Level Time Breakdown", t_total, grid); + DistPrint("Startup ", SCT->tStartup, "Seconds", grid); + DistPrint("Main-Factor loop ", SCT->pdgstrfTimer, "Seconds", grid); + DistPrint("3D-GatherLU ", SCT->gatherLUtimer, "Seconds", grid); + DistPrint("tTotal ", t_total, "Seconds", grid); + + DistPrintMarkupHeader("Components of Factor Loop",SCT->pdgstrfTimer, grid); + DistPrint("3D-AncestorReduce ", SCT->ancsReduce, "Seconds", grid); + DistPrint("Pipeline Tail ", SCT->tAsyncPipeTail, "Seconds", grid); + +} + +void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + sprintf( funName, "Grid-%d Factor:Level-%d ", grid3d->zscp.Iam, + (int) maxLvl-1-i); + DistPrint(funName, SCT->tFactor3D[i], "Seconds", grid); + 
// sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + double tsum; + MPI_Reduce( &SCT->tFactor3D[i], &tsum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm ); + + double tmax; + MPI_Reduce( &SCT->tFactor3D[i], &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->zscp.comm ); + + double tavg = tsum /(grid3d->zscp.Np>>i); + double lLmb = 100*(tmax-tavg)/tavg; + sprintf( funName, "Imbalance Factor:Level-%d ", (int) maxLvl-1-i); + if(!grid3d->zscp.Iam) + DistPrint(funName, lLmb, "Seconds", grid); + // sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + // + double cvolFactor; + MPI_Reduce( &SCT->commVolFactor, &cvolFactor, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double cvolRed; + MPI_Reduce( &SCT->commVolRed, &cvolRed, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + + int_t Np = (grid3d->npcol) * (grid3d->nprow) * (grid3d->npdep); + if (!grid3d->iam) + { + /* code */ + printf("| commVolRed | %g | %g |\n", cvolRed, cvolRed/Np ); + printf("| commVolFactor | %g | %g |\n", cvolFactor, cvolFactor/Np ); + } + +} + +int +get_acc_offload () +{ + char *ttemp; + ttemp = getenv ("SUPERLU_ACC_OFFLOAD"); + + if (ttemp) + return atoi (ttemp); + else + return 0; +} + + +void Free_HyP(HyP_t* HyP) +{ +#if 0 + _mm_free(HyP->lookAhead_info ); + _mm_free(HyP->Remain_info ); + 
_mm_free(HyP->lookAhead_L_buff ); + _mm_free(HyP->Remain_L_buff ); + _mm_free(HyP->Ublock_info ); + _mm_free(HyP->Ublock_info_Phi ); + _mm_free(HyP->Lblock_dirty_bit ); + _mm_free(HyP->Ublock_dirty_bit ); +#else + SUPERLU_FREE(HyP->lookAhead_info ); + SUPERLU_FREE(HyP->Remain_info ); + SUPERLU_FREE(HyP->lookAhead_L_buff ); + SUPERLU_FREE(HyP->Remain_L_buff ); + SUPERLU_FREE(HyP->Ublock_info ); + SUPERLU_FREE(HyP->Ublock_info_Phi ); + SUPERLU_FREE(HyP->Lblock_dirty_bit ); + SUPERLU_FREE(HyP->Ublock_dirty_bit ); +#endif + SUPERLU_FREE(HyP); +} + +int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid) +{ + for (int_t i = 0; i < HyP->RemainBlk; ++i) + { + int_t lib = LBi( HyP->Remain_info[i].ib, grid) ; + HyP->Ublock_dirty_bit[lib] = k0; + } + + + for (int_t j = 0; j < HyP->jj_cpu; ++j) + { + int_t ljb = LBj( HyP->Ublock_info_Phi[j].jb, grid) ; + HyP->Lblock_dirty_bit[ljb] = k0; + } + return 0; +} + +int_t scuStatUpdate( + int_t knsupc, + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ) +{ + int_t Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow; + int_t Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow; + int_t nbrow = Lnbrow + Rnbrow; + int_t ncols_host = HyP->num_u_blks == 0 ? 0 : HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + int_t ncols_Phi = HyP->num_u_blks_Phi == 0 ? 
0 : HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + int_t ncols = ncols_Phi+ ncols_host; + // int_t ncols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols + // + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; // ### + SCT->LookAheadRowSepMOP += 2 * (double)knsupc * (double)(nbrow); + SCT->GatherMOP += 2 * (double)HyP->ldu * (double)ncols; + + + SCT->LookAheadGEMMFlOp += 2 * ((double)Lnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Lnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->LookAheadScatterMOP += 3 * Lnbrow * ncols; + SCT->schur_flop_counter += 2 * ((double)Rnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Rnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->scatter_mem_op_counter += 3 * Rnbrow * ncols; + stat->ops[FACT] += 2 * ((double)(Rnbrow + Lnbrow) * (double)HyP->ldu * (double)ncols_host + + (double)(Rnbrow + Lnbrow) * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + + return 0; + +} diff --git a/SRC/sgather.c b/SRC/sgather.c new file mode 100644 index 00000000..7bbc5498 --- /dev/null +++ b/SRC/sgather.c @@ -0,0 +1,398 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Various gather routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include 
+#include "superlu_sdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t sprintMatrix(char*s, int n, int m, float* A, int LDA)
+{
+    printf("%s\n", s );
+    for(int i=0; ixsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    sgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    sgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // sprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // sprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+void sRgather_U( int_t k, int_t jj0, int_t *usub,	float *uval,
+                 float *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;
+    HyP->ldu = 0;
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /*here goes the condition wether jb block exists on Phi or not*/
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first colum on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on  on ncols*/
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    sgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    sgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* sRgather_U */
diff --git a/SRC/sgsequ_dist.c b/SRC/sgsequ_dist.c
new file mode 100644
index 00000000..a133fe1c
--- /dev/null
+++ b/SRC/sgsequ_dist.c
@@ -0,0 +1,204 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file sgsequ_dist.c
+ * \brief Computes row and column scalings
+ *
+ * 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from LAPACK routine SGEEQU
+ * 
+ */ +/* + * History: Modified from LAPACK routine SGEEQU + */ +#include +#include "superlu_sdefs.h" + + + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   SGSEQU_DIST computes row and column scalings intended to equilibrate an   
+ *   M-by-N sparse matrix A and reduce its condition number. R returns the row
+ *   scale factors and C the column scale factors, chosen to try to make   
+ *   the largest element in each row and column of the matrix B with   
+ *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+ *
+ *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+ *   number and BIGNUM = largest safe number.  Use of these scaling   
+ *   factors is not guaranteed to reduce the condition number of A but   
+ *   works well in practice.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input) SuperMatrix*
+ *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
+ *           factors are to be computed. The type of A can be:
+ *           Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE.
+ *	    
+ *   R       (output) float*, size A->nrow
+ *           If INFO = 0 or INFO > M, R contains the row scale factors   
+ *           for A.
+ *	    
+ *   C       (output) float*, size A->ncol
+ *           If INFO = 0,  C contains the column scale factors for A.
+ *	    
+ *   ROWCND  (output) float*
+ *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+ *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+ *           AMAX is neither too large nor too small, it is not worth   
+ *           scaling by R.
+ *	    
+ *   COLCND  (output) float*
+ *           If INFO = 0, COLCND contains the ratio of the smallest   
+ *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+ *           worth scaling by C.
+ *	    
+ *   AMAX    (output) float*
+ *           Absolute value of largest matrix element.  If AMAX is very   
+ *           close to overflow or very close to underflow, the matrix   
+ *           should be scaled.
+ *	    
+ *   INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *           > 0:  if INFO = i,  and i is   
+ *                 <= A->nrow:  the i-th row of A is exactly zero   
+ *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
+ *
+ *   ===================================================================== 
+ * 
+ */ +void +sgsequ_dist(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info) +{ + + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float rcmin, rcmax; + float bignum, smlnum; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerr_dist("sgsequ_dist", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Get machine constants. */ + smlnum = smach_dist("S"); /* slamch_("S"); */ + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. 
*/ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* sgsequ_dist */ + + diff --git a/SRC/slangs_dist.c b/SRC/slangs_dist.c new file mode 100644 index 00000000..0d81c5f5 --- /dev/null +++ b/SRC/slangs_dist.c @@ -0,0 +1,130 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slangs_dist.c + * \brief Returns the value of the one norm, the infinity norm, or the element of largest value + * Modified from SuperLU routine SLANGS + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * 
+ */ +/* + * File name: slangs_dist.c + * History: Modified from lapack routine SLANGE + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   SLANGS_DIST returns the value of the one norm, or the Frobenius norm, or 
+ *   the infinity norm, or the element of largest absolute value of a 
+ *   real matrix A.   
+ *
+ *   Description   
+ *   ===========   
+ *
+ *   SLANGS_DIST returns the value   
+ *
+ *      SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+ *               (   
+ *               ( norm1(A),         NORM = '1', 'O' or 'o'   
+ *               (   
+ *               ( normI(A),         NORM = 'I' or 'i'   
+ *               (   
+ *               ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+ *
+ *   where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+ *   normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+ *   normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+ *   squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   NORM    (input) CHARACTER*1   
+ *           Specifies the value to be returned in SLANGE as described above.   
+ *   A       (input) SuperMatrix*
+ *           The M by N sparse matrix A. 
+ *
+ *  =====================================================================
+ * 
+ */ + +float slangs_dist(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float value = 0.0, sum; + float *rwork; + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if ( (strncmp(norm, "M", 1)==0 ) ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, fabs( Aval[i]) ); + + } else if (strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } + + } else if (strncmp(norm, "I", 1)==0) { + /* Find normI(A). */ + if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += fabs(Aval[i]); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else + ABORT("Illegal norm specified."); + + return (value); + +} /* slangs_dist */ + diff --git a/SRC/slaqgs_dist.c b/SRC/slaqgs_dist.c new file mode 100644 index 00000000..bef736b5 --- /dev/null +++ b/SRC/slaqgs_dist.c @@ -0,0 +1,154 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slaqgs_dist.c + * \brief Equilibrates a general sparse matrix + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Modified from LAPACK routine SLAQGE
+ * 
+ */ +/* + * File name: slaqgs_dist.c + * History: Modified from LAPACK routine SLAQGE + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   SLAQGS_DIST equilibrates a general sparse M by N matrix A using the row and   
+ *   scaling factors in the vectors R and C.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input/output) SuperMatrix*
+ *           On exit, the equilibrated matrix.  See EQUED for the form of 
+ *           the equilibrated matrix. The type of A can be:
+ *	    Stype = NC; Dtype = SLU_S; Mtype = GE.
+ *	    
+ *   R       (input) float*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *	    
+ *   C       (input) float*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *	    
+ *   ROWCND  (input) float
+ *           Ratio of the smallest R(i) to the largest R(i).
+ *	    
+ *   COLCND  (input) float
+ *           Ratio of the smallest C(i) to the largest C(i).
+ *	    
+ *   AMAX    (input) float
+ *           Absolute value of largest matrix entry.
+ *	    
+ *   EQUED   (output) char*
+ *           Specifies the form of equilibration that was done.   
+ *           = 'N':  No equilibration   
+ *           = 'R':  Row equilibration, i.e., A has been premultiplied by  
+ *                   diag(R).   
+ *           = 'C':  Column equilibration, i.e., A has been postmultiplied  
+ *                   by diag(C).   
+ *           = 'B':  Both row and column equilibration, i.e., A has been
+ *                   replaced by diag(R) * A * diag(C).   
+ *
+ *   Internal Parameters   
+ *   ===================   
+ *
+ *   THRESH is a threshold value used to decide if row or column scaling   
+ *   should be done based on the ratio of the row or column scaling   
+ *   factors.  If ROWCND < THRESH, row scaling is done, and if   
+ *   COLCND < THRESH, column scaling is done.   
+ *
+ *   LARGE and SMALL are threshold values used to decide if row scaling   
+ *   should be done based on the absolute size of the largest matrix   
+ *   element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+ *
+ *   ===================================================================== 
+ * 
+ */ + +void +slaqgs_dist(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float large, small, cj; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + Aval[i] *= cj; + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= r[irow]; + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= cj * r[irow]; + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* slaqgs_dist */ + diff --git a/SRC/sldperm_dist.c b/SRC/sldperm_dist.c new file mode 100644 index 00000000..6178c1b0 --- /dev/null +++ b/SRC/sldperm_dist.c @@ -0,0 +1,175 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include "superlu_sdefs.h" + +extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], + int_t*, int_t [], int_t*, int_t[], int_t*, double [], + int_t [], int_t []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   SLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal
+ *              entries of the permuted matrix are one in absolute value and
+ *              all the off-diagonal entries are less than or equal to one in
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) float*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors.
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors.
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * 
+ */ + +int +sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + float nzval[], int_t *perm, float u[], float v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + extern double *doubleMalloc_dist(int_t); + double *nzval_d = doubleMalloc_dist(nnz); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sldperm_dist()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + PrintInt10("colptr", n+1, colptr); + PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_dist(icntl); + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; + + for (i = 0; i < nnz; ++i) nzval_d[i] = nzval[i]; + mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval_d, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. 
The last " IFMT " permutations:\n", n-num); + PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sldperm_dist()"); +#endif + return (info[0]); +} + diff --git a/SRC/slook_ahead_update.c b/SRC/slook_ahead_update.c new file mode 100644 index 00000000..5a6999cd --- /dev/null +++ b/SRC/slook_ahead_update.c @@ -0,0 +1,278 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/************************************************************************/ +/*! @file + * \brief Look-ahead update of the Schur complement. + * + *
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *  September 18, 2017
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ *
+ */
+
+#include <assert.h>  /* assertion doesn't work if NDEBUG is defined */
+
+/* NOTE(review): this file has no function definition -- it is #include'd
+ * into the body of the panel-factorization driver.  Every variable used
+ * below (iukp0, rukp0, usub, lsub, bigU, bigV, grid, stat, ...) is
+ * declared by the including routine; confirm against the includer. */
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+j = jj0 = 0;  /* After the j-loop, jj0 points to the first block in U
+                 outside look-ahead window. */
+
+#if 0
+for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
+
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{
+    float zero = 0.0;
+
+#if 1
+    /* Search is needed because a permutation perm_u is involved for j  */
+    /* Search along the row for the pointers {iukp, rukp} pointing to
+     * block U(k,j).
+     * j    -- current block in look-ahead window, initialized to 0 on entry
+     * iukp -- point to the start of index[] metadata
+     * rukp -- point to the start of nzval[] array
+     * jb   -- block number of block U(k,j), update destination column
+     */
+    arrive_at_ublock(
+		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         	     iukp0, rukp0, usub, perm_u, xsup, grid
+		    );
+#else
+    jb = usub[iukp];
+    ljb = LBj (jb, grid);     /* Local block number of U(k,j). */
+    nsupc = SuperSize(jb);
+    iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
+    j++;
+    jj0++;
+    jj = iukp;
+
+    while (usub[jj] == klst) ++jj; /* Skip zero segments */
+
+    ldu = klst - usub[jj++];
+    ncols = 1;
+
+    /* This loop computes ldu. */
+    for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize > ldu)  ldu = segsize;
+        }
+    }
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+	    iam, k, jb, ldu, ncols, nsupc);
+    ++num_copy;
+#endif
+
+    /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+    tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+    for (jj = iukp; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            lead_zero = ldu - segsize;
+            for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+            tempu += lead_zero;
+            for (i = 0; i < segsize; ++i) {
+                tempu[i] = uval[rukp + i];
+            }
+            rukp += segsize;
+            tempu += segsize;
+        }
+    }
+    tempu = bigU; /* set back to the beginning of the buffer */
+
+    nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
+    // double ttx =SuperLU_timer_();
+
+    int current_b = 0; /* Each thread starts searching from first block.
+                          This records the moving search target.           */
+    lptr = lptr0; /* point to the start of index[] in supernode L(:,k) */
+    luptr = luptr0;
+
+#ifdef _OPENMP
+    /* Sherry -- examine all the shared variables ??
+       'firstprivate' ensures that the private variables are initialized
+       to the values before entering the loop.  */
+#pragma omp parallel for \
+    firstprivate(lptr,luptr,ib,current_b) private(lb) \
+    default(shared) schedule(dynamic)
+#endif
+    for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
+        int temp_nbrow; /* automatic variable is private */
+
+        /* Search for the L block that my thread will work on.
+           No need to search from 0, can continue at the point where
+           it is left from last iteration.
+           Note: Blocks may not be sorted in L. Different thread picks up
+	   different lb.   */
+        for (; current_b < lb; ++current_b) {
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;   /* move to next block */
+            luptr += temp_nbrow;  /* move to next block */
+        }
+
+#ifdef _OPENMP
+        int_t thread_id = omp_get_thread_num ();
+#else
+        int_t thread_id = 0;
+#endif
+        float * tempv = bigV + ldt*ldt*thread_id;
+
+        int *indirect_thread  = indirect + ldt * thread_id;
+        int *indirect2_thread = indirect2 + ldt * thread_id;
+        ib = lsub[lptr];        /* block number of L(i,k) */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+	/* assert (temp_nbrow <= nbrow); */
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+	/*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
+        /* calling gemm */
+	stat->ops[FACT] += 2.0 * (flops_t)temp_nbrow * ldu * ncols;
+#if defined (USE_VENDOR_BLAS)
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	}
+#endif
+        /* Now scattering the output. */
+        if (ib < jb) {    /* A(i,j) is in U. */
+            sscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            sscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread,
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+        ++current_b;         /* Move to next block. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadScatterTimer += tt_end - tt_start;
+	}
+#endif
+    } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
+
+    iukp += nsupc; /* Move to block U(k,j+1) */
+
+    /* =========================================== *
+     * == factorize L(:,j) and send if possible == *
+     * =========================================== */
+    kk = jb; /* destination column that is just updated */
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1];
+#else
+    kk0 = perm_u[2 * (j - 1)];
+#endif
+    look_id = kk0 % (1 + num_look_aheads);
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+        /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads);
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+
+        double tt1 = SuperLU_timer_();
+
+        PSGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+        pdgstrf2_timer += SuperLU_timer_() - tt1;
+
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Multicasts numeric values of L(:,kk) to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) {
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else {
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d, tags %d:%d \n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj,
+			SLU_MPI_TAG(0,kk0), SLU_MPI_TAG(1,kk0));
+#endif
+            }  /* end if ( ToSendR[lk][pj] != EMPTY ) */
+        } /* end for pj ... */
+    } /* end if( look_ahead[kk] == k0 && kcol == mycol ) */
+} /* end while j < nub and perm_u[j] 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_sdefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + float *bigV; + float *bigU; + float *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + float *Remain_L_buff; /* on GPU */ + float *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} sSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + float *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + float *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + sSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + float *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* perm_c_supno; 
+ int_t first_l_block_gpu, first_u_block_gpu; +} sLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + sLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} ssluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int ssparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + sscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + sLUValSubBuf_t **LUvsbs, // size = num Look ahead + sdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + ssluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int sinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + ssluGPU_t *sluGPU, + gridinfo_t *grid, + sLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int sreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + sLUstruct_t *LUstruct); + +extern int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT); +extern int ssendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, ssluGPU_t *sluGPU); +extern int ssendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, float* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, ssluGPU_t *sluGPU, HyP_t* HyP +); + +extern int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, + sLUstruct_t 
*LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void sCopyLUToGPU3D (int* isNodeInMyGrid, sLocalLU_t *A_host, + ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT); +extern int sfree_LUstruct_gpu (sLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(ssluGPU_t *sluGPU); + +extern void sPrint_matrix( char *desc, int_t m, int_t n, float *dA, int_t lda ); + +/*to print out various statistics*/ +void sprintGPUStats(sLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/smemory_dist.c b/SRC/smemory_dist.c new file mode 100644 index 00000000..8c9ef510 --- /dev/null +++ b/SRC/smemory_dist.c @@ -0,0 +1,286 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Memory utilities + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * 
+ */ + +#include "superlu_sdefs.h" + + +/* Variables external to this file */ +extern SuperLU_LU_stack_t stack; + + +void *suser_malloc_dist(int_t bytes, int_t which_end) +{ + void *buf; + + if ( SuperLU_StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) stack.array + stack.top1; + stack.top1 += bytes; + } else { + stack.top2 -= bytes; + buf = (char*) stack.array + stack.top2; + } + + stack.used += bytes; + return buf; +} + + +void suser_free_dist(int_t bytes, int_t which_end) +{ + if ( which_end == HEAD ) { + stack.top1 -= bytes; + } else { + stack.top2 += bytes; + } + stack.used -= bytes; +} + + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      The amount of space needed in bytes to perform factorization.
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization.
+ * 
+ */ +int_t sQuerySpace_dist(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + SuperLUStat_t *stat, superlu_dist_mem_usage_t *mem_usage) +{ + register int_t dword, gb, iword, k, nb, nsupers; + int_t *index, *xsup; + int iam, mycol, myrow; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + iword = sizeof(int_t); + dword = sizeof(float); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + mem_usage->for_lu = 0.; + + /* For L factor */ + nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->npcol + mycol; /* Global block number. */ + if ( gb < nsupers ) { + index = Llu->Lrowind_bc_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float) + ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); + mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); + } + } + } + + /* For U factor */ + nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->nprow + myrow; /* Global block number. 
*/ + if ( gb < nsupers ) { + index = Llu->Ufstnz_br_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float)(index[2] * iword); + mem_usage->for_lu += (float)(index[1] * dword); + } + } + } + + /* Working storage to support factorization */ + mem_usage->total = mem_usage->for_lu; +#if 0 + mem_usage->total += + (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + + ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); + /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ + mem_usage->total += + (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); + mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; + k = CEILING( nsupers, grid->nprow ); + mem_usage->total += (float)(2 * k * iword); +#else + /*mem_usage->total += stat->current_buffer;*/ + mem_usage->total += stat->peak_buffer; + +#if ( PRNTlevel>=1 ) + if (iam==0) printf(".. sQuerySpace: peak_buffer %.2f (MB)\n", + stat->peak_buffer * 1.0e-6); +#endif +#endif + return 0; +} /* sQuerySpace_dist */ + + +/* + * Allocate storage for original matrix A + */ +void +sallocateA_dist(int_t n, int_t nnz, float **a, int_t **asub, int_t **xa) +{ + *a = (float *) floatMalloc_dist(nnz); + *asub = (int_t *) intMalloc_dist(nnz); + *xa = (int_t *) intMalloc_dist(n+1); +} + + +float *floatMalloc_dist(int_t n) +{ + float *buf; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float) ); + return (buf); +} + +float *floatCalloc_dist(int_t n) +{ + float *buf; + register int_t i; + float zero = 0.0; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float)); + if ( !buf ) return (buf); + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + +/*************************************** + * The following are from 3D code. 
+ ***************************************/ + +double sgetLUMem(int_t nodeId, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0.0; + gridinfo_t* grid = &(grid3d->grid2d); + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + // double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( nodeId, grid ); + if (mycol == pc) + { + int_t ljb = LBj( nodeId, grid ); /* Local block number */ + int_t *lsub; + float* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(nodeId) * len; + memlu += 1.0 * (len1 * sizeof(int_t) + len2 * sizeof(float)); + } + } + + int_t pr = PROW( nodeId, grid ); + if (myrow == pr) + { + int_t lib = LBi( nodeId, grid ); /* Local block number */ + int_t *usub; + // double* unzval; + usub = Ufstnz_br_ptr[lib]; + + if (usub != NULL) + { + int_t lenv = usub[1]; + int_t lens = usub[2]; + memlu += 1.0 * (lenv * sizeof(int_t) + lens * sizeof(float)); + } + } + return memlu; +} + +double smemForest(sForest_t*sforest, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0; + + int_t *perm_c_supno = sforest->nodeList; + int_t nnodes = sforest->nNodes; + for (int i = 0; i < nnodes; ++i) + { + memlu += sgetLUMem(perm_c_supno[i], LUstruct, grid3d); + } + + return memlu; +} + +void s3D_printMemUse( trf3Dpartition_t* trf3Dpartition, sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d ) +{ + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + + double memNzLU 
= 0.0; + double memzLU = 0.0; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + { + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + if (sforest) + { + if (!myZeroTrIdxs[ilvl]) + { + memNzLU += smemForest(sforest, LUstruct, grid3d); + } + else + { + memzLU += smemForest(sforest, LUstruct, grid3d); + } + } + } + double sumMem = memNzLU + memzLU; + double maxMem, minMem, avgNzLU, avgzLU; + /*Now reduce it among all the procs*/ + MPI_Reduce(&sumMem, &maxMem, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm); + MPI_Reduce(&sumMem, &minMem, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm); + MPI_Reduce(&memNzLU, &avgNzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + MPI_Reduce(&memzLU, &avgzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + + int_t nProcs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + if (!(grid3d->iam)) + { + /* code */ + printf("| Total Memory \t| %.2g \t| %.2g \t|%.2g \t|\n", (avgNzLU + avgzLU) / nProcs, maxMem, minMem ); + printf("| LU-LU(repli) \t| %.2g \t| %.2g \t|\n", (avgNzLU) / nProcs, avgzLU / nProcs ); + } +} + diff --git a/SRC/smyblas2_dist.c b/SRC/smyblas2_dist.c new file mode 100644 index 00000000..34281a54 --- /dev/null +++ b/SRC/smyblas2_dist.c @@ -0,0 +1,248 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Level 2 BLAS operations: solves and matvec, written in C + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/* + * File name: smyblas2.c + * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + */ + +/*! \brief + * + *
+ * Solves a dense UNIT lower triangular system. The unit lower
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol).
+ * The solution will be returned in the rhs vector.
+ * 
+ */ +void slsolve ( int ldm, int ncol, float *M, float *rhs ) +{ + int k; + float x0, x1, x2, x3, x4, x5, x6, x7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * 
*Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + return; +} + +/*! \brief + * + *
+ * Solves a dense upper triangular system. The upper triangular matrix is
+ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
+ * in the rhs vector.
+ * 
+ */ +void +susolve ( + int ldm, /* in */ + int ncol, /* in */ + float *M, /* in */ + float *rhs /* modified */ +) +{ + float xj; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) + rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ + + jcol--; + + } + return; +} + + +/*! \brief + * + *
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
+ * 
+ */ +void smatvec ( + int ldm, /* in -- leading dimension of M */ + int nrow, /* in */ + int ncol, /* in */ + float *M, /* in */ + float *vec, /* in */ + float *Mxvec /* in/out */ +) +{ + float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + return; +} + diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c new file mode 100644 index 00000000..5140cc24 --- /dev/null +++ b/SRC/snrformat_loc3d.c @@ -0,0 +1,575 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, float *Dst, int lddst, float *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void sGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ float *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = floatMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (float *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	
+	/* Btmp <- compact(B), compacting B */
+	float *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	float *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = floatMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = floatMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_FLOAT, B1,
+		    b_counts_int, b_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((float*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* sGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int sScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    float *B = (float *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    float *B2d = (float *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    float *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = floatMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    float *Btmp; // on 3D grid
+    Btmp = floatMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_FLOAT,
+		     Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking number procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_FLOAT,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* sScatter_B3d */
diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c
index 94db174a..ca97bc36 100644
--- a/SRC/sp_colorder.c
+++ b/SRC/sp_colorder.c
@@ -223,13 +223,13 @@ sp_colorder(superlu_dist_options_t *options,  SuperMatrix *A, int_t *perm_c,
 int
 check_perm_dist(char *what, int_t n, int_t *perm)
 {
-    register int_t i;
+    register int i;
     int_t          *marker;
     marker = (int_t *) intCalloc_dist(n);
 
     for (i = 0; i < n; ++i) {
 	if ( perm[i] >= n || marker[perm[i]] == 1 ) {
-	    printf("%s: Not a valid PERM[" IFMT "] = " IFMT "\n", 
+	    printf("%s: Not a valid PERM[%d] = " IFMT "\n", 
 		   what, i, perm[i]);
 	    ABORT("check_perm_dist");
 	} else {
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 08d1e8f1..e7ea44db 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -53,6 +53,10 @@ at the top-level directory.
 	         of L and U, compared with A;
 	    = 7: the minimum value of the product M*N*K for a GEMM call
 	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+            = 8: the maximum buffer size on GPU that can hold the "dC"
+	         matrix in the GEMM call for the Schur complement update.
+		 If this is too small, the Schur complement update will be
+		 done in multiple partitions, may be slower.
 	    
    (SP_IENV_DIST) (output) int
             >= 0: the value of the parameter specified by ISPEC   
@@ -62,13 +66,11 @@ at the top-level directory.
 
*/ - #include #include - -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; @@ -91,16 +93,16 @@ sp_ienv_dist(int_t ispec) return(atoi(ttemp)); } else - return 20; + return 60; // 20 case 3: - ttemp = getenv("NSUP"); + ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h if(ttemp) { - return(atoi(ttemp)); + int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE ); + return (k); } - else - return 128; + else return 256; // 128; #endif case 6: @@ -110,8 +112,11 @@ sp_ienv_dist(int_t ispec) case 7: ttemp = getenv ("N_GEMM"); if (ttemp) return atoi (ttemp); - else return 10000; - + else return 100; // 10000; + case 8: + ttemp = getenv ("MAX_BUFFER_SIZE"); + if (ttemp) return atoi (ttemp); + else return 256000000; // 256000000 = 16000^2 } /* Invalid value for ISPEC */ @@ -119,6 +124,5 @@ sp_ienv_dist(int_t ispec) xerr_dist("sp_ienv", &i); return 0; - } /* sp_ienv_dist */ diff --git a/SRC/sreadMM.c b/SRC/sreadMM.c new file mode 100644 index 00000000..d2ac8c8e --- /dev/null +++ b/SRC/sreadMM.c @@ -0,0 +1,244 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief + * Contributed by Francois-Henry Rouet. + * + */ +#include +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t j, k, jsize, nnz, nz, new_nonz; + float *a, *val; + int_t *asub, *xa, *row, *col; + int_t zero_base = 0; + char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64]; + int expand; + char *cs; + + /* File format: + * %%MatrixMarket matrix coordinate real general/symmetric/... + * % ... + * % (optional comments) + * % ... + * #rows #non-zero + * Triplet in the rest of lines: row col value + */ + + /* 1/ read header */ + cs = fgets(line,512,fp); + for (p=line; *p!='\0'; *p=tolower(*p),p++); + + if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) { + printf("Invalid header (first line does not contain 5 tokens)\n"); + exit(-1); + } + + if(strcmp(banner,"%%matrixmarket")) { + printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n"); + exit(-1); + } + + if(strcmp(mtx,"matrix")) { + printf("Not a matrix; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(crd,"coordinate")) { + printf("Not in coordinate format; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(arith,"real")) { + if(!strcmp(arith,"complex")) { + printf("Complex matrix; use zreadMM instead!\n"); + exit(-1); + } + else if(!strcmp(arith, "pattern")) { + printf("Pattern matrix; values are needed!\n"); + exit(-1); + } + else { + printf("Unknown arithmetic\n"); + exit(-1); + } + } + + if(strcmp(sym,"general")) { + printf("Symmetric matrix: will be expanded\n"); + expand=1; + } else + expand=0; + + /* 2/ Skip comments */ + while(banner[0]=='%') { + cs = fgets(line,512,fp); + sscanf(line,"%s",banner); + } + + /* 3/ Read n and nnz */ +#ifdef _LONGINT + sscanf(line, "%lld%lld%lld", m, n, nonz); +#else + sscanf(line, "%d%d%d",m, n, nonz); +#endif + + if(*m!=*n) { + printf("Rectangular matrix!. 
Abort\n"); + exit(-1); + } + + if(expand) + new_nonz = 2 * *nonz - *n; + else + new_nonz = *nonz; + + *m = *n; + printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz); + fflush(stdout); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = floatMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* 4/ Read triplets of values */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + + j = fscanf(fp, IFMT IFMT "%f\n", &row[nz], &col[nz], &val[nz]); + + if ( nnz == 0 ) /* first nonzero */ { + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else + printf("triplet file: row/col indices are one-based.\n"); + fflush(stdout); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. 
*/ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; + if(expand) { + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } + } + ++nz; + } + } + + *nonz = nz; + if(expand) { + printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz); + fflush(stdout); + } + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + + +static void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "sreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + i = fscanf(fp, "%lf\n", &b[i]); + /*fscanf(fp, "%d%lf\n", &j, &b[i]);*/ + /* readpair_(j, &b[i]);*/ + fclose(fp); +} diff --git a/SRC/sreadhb.c b/SRC/sreadhb.c new file mode 100644 index 00000000..4476af93 --- /dev/null +++ b/SRC/sreadhb.c @@ -0,0 +1,389 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read a FLOAT PRECISION matrix stored in Harwell-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ +#include +#include +#include +#include "superlu_sdefs.h" + +/* + * Prototypes + */ +static void ReadVector(FILE *, int_t, int_t *, int_t, int_t); +static void sReadValues(FILE *, int_t, float *, int_t, int_t); +static void FormFullA(int_t, int_t *, float **, int_t **, int_t **); +static int DumpLine(FILE *); +static int ParseIntFormat(char *, int_t *, int_t *); +static int ParseFloatFormat(char *, int_t *, int_t *); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Read a FLOAT PRECISION matrix stored in Harwell-Boeing format
+ * as described below.
+ *
+ * Line 1 (A72,A8)
+ *  	Col. 1 - 72   Title (TITLE)
+ *	Col. 73 - 80  Key (KEY)
+ *
+ * Line 2 (5I14)
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD)
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD)
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD)
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD)
+ *                    (including starting guesses and solution vectors
+ *		       if present)
+ *           	      (zero indicates no right-hand side data is present)
+ *
+ * Line 3 (A3, 11X, 4I14)
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW)
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL)
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO)
+ *	              (equal to number of entries for assembled matrices)
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL)
+ *	              (zero in the case of assembled matrices)
+ * Line 4 (2A16, 2A20)
+ * 	Col. 1 - 16   Format for pointers (PTRFMT)
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT)
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT)
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT)
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present
+ *    	Col. 1 	      Right-hand side type:
+ *	         	  F for full storage or M for same format as matrix
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP)
+ *    	Col. 3        X if an exact solution vector(s) is supplied.
+ *	Col. 15 - 28  Number of right-hand sides (NRHS)
+ *	Col. 29 - 42  Number of row indices (NRHSIX)
+ *          	      (ignored in case of unassembled matrices)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *	R Real matrix
+ *	C Complex matrix
+ *	P Pattern only (no numerical values supplied)
+ *
+ * Second Character:
+ *	S Symmetric
+ *	U Unsymmetric
+ *	H Hermitian
+ *	Z Skew symmetric
+ *	R Rectangular
+ *
+ * Third Character:
+ *	A Assembled
+ *	E Elemental matrices (unassembled)
+ * 
+ */ + +void +sreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + + register int_t i, numer_lines, rhscrd = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int_t sym; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sreadhb_dist()"); +#endif + + /* Line 1 */ + fgets(buf, 100, fp); + + /* Line 2 */ + for (i=0; i<5; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + if (i == 4 && tmp) rhscrd = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); + + /* Line 4: format statement */ + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &colnum, &colsize); + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &rownum, &rowsize); + fscanf(fp, "%20c", buf); + ParseFloatFormat(buf, &valnum, &valsize); + fscanf(fp, "%20c", buf); + DumpLine(fp); + + /* Line 5: right-hand side */ + if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */ + +#if ( DEBUGlevel>=1 ) + if ( !iam ) { + printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz); + printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize); + printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize); + printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize); + } +#endif + + ReadVector(fp, *ncol+1, *colptr, colnum, 
colsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]); +#endif + ReadVector(fp, *nonz, *rowind, rownum, rowsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read rowind[" IFMT "] = " IFMT "\n", *nonz-1, (*rowind)[*nonz-1]); +#endif + if ( numer_lines ) { + sReadValues(fp, *nonz, *nzval, valnum, valsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read nzval[" IFMT "] = %e\n", *nonz-1, (*nzval)[*nonz-1]); +#endif + } + + sym = (type[1] == 'S' || type[1] == 's'); + if ( sym ) { + FormFullA(*ncol, nonz, nzval, rowind, colptr); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sreadhb_dist()"); +#endif +} + +/* Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' 
&& *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); + + return 0; +} + +static void +ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 
) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + } + + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 ) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} diff --git a/SRC/sreadrb.c b/SRC/sreadrb.c new file mode 100644 index 00000000..7165536e --- /dev/null +++ b/SRC/sreadrb.c @@ -0,0 +1,346 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file sreadrb.c + * \brief Read a matrix stored in Rutherford-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * 
+ * + * Purpose + * ======= + * + * Read a FLOAT PRECISION matrix stored in Rutherford-Boeing format + * as described below. + * + * Line 1 (A72, A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Matrix name / identifier (MTRXID) + * + * Line 2 (I14, 3(1X, I13)) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 16 - 28 Number of lines for pointers (PTRCRD) + * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 44 - 56 Number of lines for numerical values (VALCRD) + * + * Line 3 (A3, 11X, 4(1X, I13)) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Compressed Column: Number of rows (NROW) + * Elemental: Largest integer used to index variable (MVAR) + * Col. 30 - 42 Compressed Column: Number of columns (NCOL) + * Elemental: Number of element matrices (NELT) + * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) + * Elemental: Number of variable indeces (NVARIX) + * Col. 58 - 70 Compressed Column: Unused, explicitly zero + * Elemental: Number of elemental matrix entries (NELTVL) + * + * Line 4 (2A16, A20) + * Col. 1 - 16 Fortran format for pointers (PTRFMT) + * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Fortran format for numerical values of coefficient matrix + * (VALFMT) + * (blank in the case of matrix patterns) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. 
+ * + * First Character: + * R Real matrix + * C Complex matrix + * I integer matrix + * P Pattern only (no numerical values supplied) + * Q Pattern only (numerical values supplied in associated auxiliary value + * file) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Compressed column form + * E Elemental form + * + *
+ */ + +#include +#include +#include "superlu_sdefs.h" + +/*! \brief Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; + ++k; + } + } + + for (i = al_colptr[j]; i < 
al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} + +void +sreadrb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, numer_lines = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int sym; + + /* Line 1 */ + fgets(buf, 100, fp); + fputs(buf, stdout); + + /* Line 2 */ + for (i=0; i<4; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if (DEBUGlevel >= 1) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); + + /* Line 4: format statement */ + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &colnum, &colsize); + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &rownum, &rowsize); + fscanf(fp, "%20c", buf); + ParseFloatFormat(buf, &valnum, &valsize); + DumpLine(fp); + +#if (DEBUGlevel >= 1) + if ( !iam ) { + printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz); + printf("colnum " 
IFMT ", colsize " IFMT "\n", colnum, colsize); + printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize); + printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize); + } +#endif + + ReadVector(fp, *ncol+1, *colptr, colnum, colsize); + ReadVector(fp, *nonz, *rowind, rownum, rowsize); + if ( numer_lines ) { + sReadValues(fp, *nonz, *nzval, valnum, valsize); + } + + sym = (type[1] == 'S' || type[1] == 's'); + if ( sym ) { + FormFullA(*ncol, nonz, nzval, rowind, colptr); + } + +} diff --git a/SRC/sreadtriple.c b/SRC/sreadtriple.c new file mode 100644 index 00000000..7e10f4ac --- /dev/null +++ b/SRC/sreadtriple.c @@ -0,0 +1,181 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief + * + */ +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t j, k, jsize, nnz, nz, new_nonz; + float *a, *val; + int_t *asub, *xa, *row, *col; + int_t zero_base = 0; + + /* File format: + * First line: #rows #non-zero + * Triplet in the rest of lines: + * row col value + */ + +#ifdef _LONGINT + fscanf(fp, "%lld%lld%lld", m, n, nonz); +#else + fscanf(fp, "%d%d%d", m, n, nonz); +#endif + +#ifdef EXPAND_SYM + new_nonz = 2 * *nonz - *n; +#else + new_nonz = *nonz; +#endif + *m = *n; + printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + +#ifdef _LONGINT + fscanf(fp, "%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); +#else // int + fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); +#endif + + if ( nnz == 0 ) /* first nonzero */ + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else { + printf("triplet file: row/col indices are one-based.\n"); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. 
*/ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; +#ifdef EXPAND_SYM + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } +#endif + ++nz; + } + } + + *nonz = nz; +#ifdef EXPAND_SYM + printf("new_nonz after symmetric expansion:\t%d\n", *nonz); +#endif + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + + +void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "sreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%f\n", &b[i]); + /* readpair_(j, &b[i]);*/ + + fclose(fp); +} + + diff --git a/SRC/sreadtriple_noheader.c b/SRC/sreadtriple_noheader.c new file mode 100644 index 00000000..32905984 --- /dev/null +++ b/SRC/sreadtriple_noheader.c @@ -0,0 +1,199 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief + * + */ +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t i, j, k, jsize, lasta, nnz, nz, new_nonz, minn = 100; + float *a, *val, vali; + int_t *asub, *xa, *row, *col; + int zero_base = 0, ret_val = 0; + + /* File format: Triplet in a line for each nonzero entry: + * row col value + * or row col real_part imaginary_part + */ + + /* First pass: determine N and NNZ */ + nz = *n = 0; + +#ifdef _LONGINT + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); +#else // int + ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); +#endif + + while (ret_val != EOF) { + *n = SUPERLU_MAX(*n, i); + *n = SUPERLU_MAX(*n, j); + minn = SUPERLU_MIN(minn, i); + minn = SUPERLU_MIN(minn, j); + ++nz; + +#ifdef _LONGINT + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); +#else // int + ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); +#endif + } + + if ( minn == 0 ) { /* zero-based indexing */ + zero_base = 1; + ++(*n); + printf("triplet file: row/col indices are zero-based.\n"); + } else { + printf("triplet file: row/col indices are one-based.\n"); + } + + *m = *n; + *nonz = nz; + rewind(fp); + +#ifdef EXPAND_SYM + new_nonz = 2 * *nonz - *n; +#else + new_nonz = *nonz; +#endif + + /* Second pass: read the actual matrix values */ + printf("m %ld, n %ld, nonz %ld\n", (long int) *m, (long int) *n, (long int) *nonz); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { +#ifdef _LONGINT + fscanf(fp, 
"%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); +#else // int32 + fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); +#endif + + if ( !zero_base ) { + /* Change to 0-based indexing. */ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz" IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; +#ifdef EXPAND_SYM + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } +#endif + ++nz; + } + } + + *nonz = nz; +#ifdef EXPAND_SYM + printf("new_nonz after symmetric expansion:\t%d\n", *nonz); +#endif + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + +#if 0 +void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i, j; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "zreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%f\n", &b[i]); + + fclose(fp); +} +#endif + diff --git a/SRC/sscatter.c b/SRC/sscatter.c new file mode 100644 index 00000000..3fc90962 --- /dev/null +++ b/SRC/sscatter.c @@ -0,0 +1,524 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Scatter the computed blocks into LU destination. + * + *
+ * -- Distributed SuperLU routine (version 6.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 18, 2017, enable SIMD vectorized scatter operation.
+ *
+ */
+#include <math.h>  /* NOTE(review): header name was lost (angle brackets stripped by markup); upstream sscatter.c includes <math.h> -- confirm */
+#include "superlu_sdefs.h"
+
+/*! \brief Scatter a dense GEMM update block into destination block L(i,j).
+ *
+ * Version 1: builds the indirect row-permutation table inline because row
+ * indices inside an L block are not sorted.  For every nonzero segment
+ * column of U(k,:), one column of tempv[] (leading dimension nbrow) is
+ * subtracted from the destination values in Lnzval_bc_ptr[ljb].
+ *
+ * NOTE(review): assumes indirect_thread[] has capacity >= the destination
+ * supernode's row count -- confirm against caller's allocation (ldt).
+ */
+void
+sscatter_l_1 (int ib,
+           int ljb,
+           int nsupc,
+           int_t iukp,
+           int_t* xsup,
+           int klst,
+           int nbrow,
+           int_t lptr,
+           int temp_nbrow,
+           int * usub,
+           int * lsub,
+           float *tempv,
+           int * indirect_thread,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+	   gridinfo_t * grid)
+{
+    // TAU_STATIC_TIMER_START("SCATTER_LB");
+    // printf("hello\n");
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+    while (ijb != ib)
+    {
+        /* Search for dest block --
+           blocks are not ordered! */
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+
+        ijb = index[lptrj];
+    }
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    lptrj += LB_DESCRIPTOR;
+    /* index[lptrj - 1] is the number of rows in the destination block. */
+    for (i = 0; i < index[lptrj - 1]; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    // tempv =bigV + (cum_nrow + cum_ncol*nbrow);
+    /* Loop over the nsupc columns of the destination supernode; skip
+       columns whose U(k,j) segment is empty (segsize == 0). */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        // printf("segsize %d \n",segsize);
+        if (segsize) {
+            /*#pragma _CRI cache_bypass nzval,tempv */
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                nzval[indirect_thread[rel]] -= tempv[i];
+                // printf("i (src) %d, perm (dest) %d  \n",i,indirect_thread[rel]);
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                // if(!(*(long*)&zz == *(long*)&tempv[i]) )
+                printf ("(%d %d, %0.3e, %0.3e, %3e ) ", ljb,
+                        nzval - Lnzval_bc_ptr[ljb] + indirect_thread[rel],
+                        nzval[indirect_thread[rel]] + tempv[i],
+                        nzval[indirect_thread[rel]],tempv[i]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            }
+            // printf("\n");
+            tempv += nbrow;
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }
+        nzval += ldv;
+        // printf("%d\n",nzval );
+    }
+    // TAU_STATIC_TIMER_STOP("SCATTER_LB");
+} /* sscatter_l_1 */
+
+/*! \brief Scatter a dense GEMM update block into destination block L(i,j).
+ *
+ * Row indices inside an L block are not sorted, so a two-step indirect
+ * permutation is built first (indirect_thread maps relative row -> dest
+ * position, indirect2 maps source row i -> dest position); the inner
+ * update loop is then a plain gather/subtract that can be vectorized
+ * with OpenMP SIMD.
+ */
+void
+sscatter_l (
+           int ib,    /* row block number of source block L(i,k) */
+           int ljb,   /* local column block number of dest. block L(i,j) */
+           int nsupc, /* number of columns in destination supernode */
+           int_t iukp, /* point to destination supernode's index[] */
+           int_t* xsup,
+           int klst,
+           int nbrow,  /* LDA of the block in tempv[] */
+           int_t lptr, /* Input, point to index[] location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* usub,
+           int_t* lsub,
+           float *tempv,
+           int* indirect_thread,int* indirect2,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+           gridinfo_t * grid)
+{
+
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the destination lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)  /* Search for destination block L(i,j) */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+    /*
+     * Build indirect table. This is needed because the indices are not sorted
+     * in the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow=index[lptrj - 1];
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    for (i = 0; i < dest_nbrow; ++i) {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    /* can be precalculated? */
+    for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] =indirect_thread[rel];
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+    /* For each column jj of the destination supernode, subtract one column
+       of tempv[] (stride nbrow); empty segments (segsize == 0) are skipped
+       and do not consume a tempv column. */
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* sscatter_l */
+
+
+/*! \brief Scatter a dense GEMM update block into destination block U(i,j).
+ *
+ * Walks local row block LBi(ib) of U to locate destination block jb, then
+ * for each nonzero segment of U(k,j) subtracts one column of tempv[]
+ * (leading dimension nbrow) from ucol[].  Destination rows are addressed
+ * relative to the segment's first-nonzero index fnz, so no indirect
+ * permutation table is needed here.
+ */
+void
+sscatter_u (int ib,
+           int jb,
+           int nsupc,
+           int_t iukp,
+           int_t * xsup,
+           int klst,
+ 	   int nbrow,      /* LDA of the block in tempv[] */
+           int_t lptr,     /* point to index location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* lsub,
+           int_t* usub,
+           float* tempv,
+           int_t ** Ufstnz_br_ptr, float **Unzval_br_ptr,
+           gridinfo_t * grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    // TAU_STATIC_TIMER_START("SCATTER_U");
+    // TAU_STATIC_TIMER_START("SCATTER_UB");
+
+    int_t jj, i, fnz, rel;
+    int segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* Reinitilize the pointers to the beginning of the k-th column/row of
+     * L/U factors.
+     * usub[] - index array for panel U(k,:)
+     */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    /* Blocks in a U row are ordered by column block number, so a linear
+       scan with "< jb" finds the destination block. */
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb) {   /* Search for destination block. */
+        ruip_lib += index[iuip_lib + 1];
+        // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor. Now point to fstnz index of block U(i,j). */
+    iuip_lib += UB_DESCRIPTOR;
+
+    // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
+    /* fnz is read every iteration (also used below to advance ruip_lib
+       even when the segment is empty). */
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        if (segsize) {          /* Nonzero segment in U(k,j). */
+            ucol = &Unzval_br_ptr[lib][ruip_lib];
+
+            // printf("========Entering loop=========\n");
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
+                // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
+                ucol[rel] -= tempv[i];
+
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                if (!(*(long *) &zz == *(long *) &tempv[i]))
+                    printf ("(%d, %0.3e, %0.3e ) ", rel, ucol[rel] + tempv[i],
+                            ucol[rel]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            } /* for i = 0:temp_nbropw */
+            tempv += nbrow; /* Jump LDA to next column */
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }  /* if segsize */
+
+        ruip_lib += ilst - fnz;
+
+    }  /* for jj = 0:nsupc */
+#ifdef PI_DEBUG
+    // printf("\n");
+#endif
+    // TAU_STATIC_TIMER_STOP("SCATTER_UB");
+} /* sscatter_u */
+
+
+/*Divide CPU-GPU dgemm work here*/
+#ifdef PI_DEBUG
+int Ngem = 2;
+// int_t Ngem = 0;
+int min_gpu_col = 6;
+#else
+
+    // int_t Ngem = 0;
+
+#endif
+
+
+#if 0 // Sherry: moved and corrected in util.c 
+#ifdef GPU_ACC
+
+void
+gemm_division_cpu_gpu(
+    int* num_streams_used,  /*number of streams that will be used */
+    int* stream_end_col,    /*array holding last column blk for each partition */
+    int * ncpu_blks,        /*Number of CPU dgemm blks */
+    /*input */
+    int nbrow,              /*number of row in A matrix */
+    int ldu,                /*number of k in dgemm */
+    int nstreams,
+    int* full_u_cols,       /*array containing prefix sum of work load */
+    int num_blks            /*Number of work load */
+)
+{
+    int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_gpublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2
+     */
+    int i, j;
+
+    // {
+    //     *num_streams_used=0;
+    //     *ncpu_blks = num_blks;
+    //     return;
+    // }
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (full_u_cols[num_blks - 1] < (Ngem / (nbrow * ldu)) || num_blks == 1 )
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+#ifdef PI_DEBUG
+        printf ("full_u_cols[num_blks-1] %d  %d \n",
+                full_u_cols[num_blks - 1], (Ngem / (nbrow * ldu)));
+        printf ("Early return \n");
+#endif
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (full_u_cols[i + 1] > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+        full_u_cols[num_blks - 1] - full_u_cols[*ncpu_blks - 1];
+
+#ifdef PI_DEBUG
+    printf ("Remaining cols %d num_blks %d cpu_blks %d \n", cols_remain,
+            num_blks, *ncpu_blks);
+#endif
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+#ifdef PI_DEBUG
+        printf ("%d %d  %d %d \n", full_u_cols[num_blks - 1],
+                full_u_cols[*ncpu_blks], *ncpu_blks, nstreams);
+#endif
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+#ifdef PI_DEBUG
+        printf ("cols_per_stream :\t%d\n", cols_per_stream);
+#endif
+
+        int_t cutoff = cols_per_stream + full_u_cols[*ncpu_blks - 1];
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+#ifdef PI_DEBUG
+                printf ("i %d, j %d, %d  %d ", i, j, full_u_cols[j + 1],
+                        cutoff);
+#endif
+                if (full_u_cols[j + 1] > cutoff)
+                {
+#ifdef PI_DEBUG
+                    printf ("cutoff met \n");
+#endif
+                    cutoff = cols_per_stream + full_u_cols[j];
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+#ifdef PI_DEBUG
+                printf ("\n");
+#endif
+            }
+
+        }
+
+    }
+}
+
+void
+gemm_division_new (int * num_streams_used,   /*number of streams that will be used */
+                   int * stream_end_col, /*array holding last column blk for each partition */
+                   int * ncpu_blks,  /*Number of CPU dgemm blks */
+                        /*input */
+                   int nbrow,    /*number of row in A matrix */
+                   int ldu,  /*number of k in dgemm */
+                   int nstreams,
+                   Ublock_info_t *Ublock_info,    /*array containing prefix sum of work load */
+                   int num_blks  /*Number of work load */
+    )
+{
+    int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_gpublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2
+     */
+    int_t i, j;
+
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (Ublock_info[num_blks - 1].full_u_cols < (Ngem / (nbrow * ldu)) || num_blks == 1)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (Ublock_info[i + 1].full_u_cols > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+       Ublock_info [num_blks - 1].full_u_cols - Ublock_info[*ncpu_blks - 1].full_u_cols;
+
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+
+        int_t cutoff = cols_per_stream + Ublock_info[*ncpu_blks - 1].full_u_cols;
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+                if (Ublock_info[j + 1].full_u_cols > cutoff)
+                {
+
+                    cutoff = cols_per_stream + Ublock_info[j].full_u_cols;
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+
+            }
+
+        }
+
+    }
+}
+
+#endif  /* defined GPU_ACC */
+
+#endif // comment out the above code 
diff --git a/SRC/sscatter3d.c b/SRC/sscatter3d.c
new file mode 100644
index 00000000..c43a7460
--- /dev/null
+++ b/SRC/sscatter3d.c
@@ -0,0 +1,625 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, float *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitilize the pointers to the begining of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+/*! \brief GEMM one (L(lb), U(j)) block pair and scatter the product.
+ *
+ * Computes tempv1 = L-panel * U-panel via superlu_sgemm, then subtracts the
+ * result from U(ib,jb) when ib < jb, otherwise from L(ib,jb).  The column
+ * range of the U panel comes from the full_u_cols prefix sums in
+ * Ublock_info[].
+ *
+ * NOTE(review): the thread_id argument is ignored -- it is unconditionally
+ * overwritten with omp_get_thread_num() (or 0 without OpenMP) below.
+ */
+void
+sblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    float *L_mat, int ldl,
+                    float *U_mat, int ldu,
+                    float *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    /* Per-thread slices of the shared scratch arrays. */
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    float alpha = 1.0, beta = 0.0;
+
+    /* calling SGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        sscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple thread updating the same block*/
+/*! \brief Locked variant of sblock_gemm_scatter.
+ *
+ * Same GEMM + scatter as sblock_gemm_scatter(), but the scatter phase is
+ * guarded by an optional OpenMP lock so multiple threads may safely update
+ * the same destination block.  The U-panel column range comes precomputed
+ * in Ublock_info[j].StCol / .ncols rather than the full_u_cols prefix sums.
+ *
+ * Fixes vs. previous revision:
+ *  - alpha/beta were declared double in this single-precision routine;
+ *    use float, consistent with sblock_gemm_scatter and superlu_sgemm.
+ *  - the busy spin on omp_test_lock() is replaced by a blocking
+ *    omp_set_lock(), which is semantically equivalent but does not burn
+ *    a core while waiting.
+ */
+void
+sblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         float *L_mat, int_t ldl,
+                         float *U_mat, int_t ldu,
+                         float *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    /* Per-thread slices of the shared scratch arrays. */
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    /* Single-precision constants for the 's' routine (was double). */
+    float alpha = 1.0, beta = 0.0;
+
+    /* calling SGEMM */
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /* Acquire the destination-block lock; omp_set_lock blocks until the
+       lock is available (no busy spin). */
+    if (lock)       /*lock is not null*/
+        omp_set_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        sscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+/*! \brief Schur-complement update for the CPU top-left region:
+ *  look-ahead L panel times host U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 float* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 sLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile int_t thread_id = omp_get_thread_num();
+#else
+    volatile int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+                        HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+} /* sblock_gemm_scatterTopLeft */
+
+/*! \brief Schur-complement update for the CPU top-right region:
+ *  look-ahead L panel times Phi-side U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  sLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile  int_t thread_id = omp_get_thread_num();
+#else
+    volatile  int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info,
+                        HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+} /* sblock_gemm_scatterTopRight */
+
+/*! \brief Schur-complement update for the CPU bottom-left region:
+ *  remaining L panel times host U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    sLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile int_t thread_id = omp_get_thread_num();
+#else
+    volatile int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info,
+                        HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+
+} /* sblock_gemm_scatterBottomLeft */
+
+/*! \brief Bottom-right part of the Schur-complement update (single precision).
+ *
+ * Thin wrapper around sblock_gemm_scatter(): pairs the "remaining" L panel
+ * (HyP->Remain_L_buff / HyP->Remain_info, HyP->Rnbrow rows) with the "Phi"
+ * U buffer (HyP->bigU_Phi, leading dimension HyP->ldu_Phi) described by
+ * HyP->Ublock_info_Phi.  All remaining arguments are forwarded unchanged.
+ * The wall-clock time of the call, measured with SuperLU_timer_(), is
+ * accumulated into this thread's slot of SCT->SchurCompUdtThreadTime.
+ * Always returns 0.
+ */
+int_t sblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     sLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    /* Unpack the local L/U index and value arrays used by the scatter. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    /* Per-thread timing slot; spacing by CACHE_LINE_SIZE presumably avoids
+       false sharing -- NOTE(review): confirm. */
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* sblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+/* NOTE: this block is compiled out (#if 0).  It is the double-precision
+ * scatter_l kept only for reference -- see dscatter.c (comment above). */
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    /* Walk the block list in column ljb until destination block ib. */
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    /* Subtract each nonzero column segment of tempv from the dest block. */
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+/*! \brief Scatter a dense update block into the U factor (single precision).
+ *
+ * Subtracts the temporary product block tempv (temp_nbrow rows per nonzero
+ * column segment, column stride nbrow) from block U(ib,jb).  The block is
+ * located by scanning the row-block index Ufstnz_br_ptr[lib], where
+ * lib = LBi(ib, grid): ruip_lib accumulates the value offset and iuip_lib
+ * the index offset of the blocks preceding jb.  indirect[] caches the row
+ * subscripts lsub[lptr .. lptr+temp_nbrow-1].  For each column jj, the
+ * stored segment spans rows fnz (its first-nonzero index, read from the
+ * index array) up to ilst = FstBlockC(ib+1) -- hence the
+ * "ucol -= fnz; ...; ucol += ilst" pointer adjustments per column.
+ */
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           float *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    /* Cache the row subscripts of the update block. */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        /* Bias ucol so that ucol[row] addresses the entry for global row
+           'row'; the segment for this column is stored from fnz to ilst. */
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/ssp_blas2_dist.c b/SRC/ssp_blas2_dist.c
new file mode 100644
index 00000000..279b97e8
--- /dev/null
+++ b/SRC/ssp_blas2_dist.c
@@ -0,0 +1,501 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Sparse BLAS 2, using some dense BLAS 2 operations
+ *
+ * 
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */
+
+/*
+ * File name:		ssp_blas2_dist.c
+ * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+
+/*
+ * Function prototypes
+ */
+#ifndef USE_VENDOR_BLAS
+extern void susolve(int, int, float*, float*);
+extern void slsolve(int, int, float*, float*);
+extern void smatvec(int, int, int, float*, float*, float*);
+#endif
+
+/*! \brief
+ *
+ *
+ *   Purpose
+ *   =======
+ *
+ *   sp_strsv_dist() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit, or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.   
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes, i.e.,
+ *             L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
+ *    
+ *   x       - (input/output) float*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * 
+ */
+int
+sp_strsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+	      SuperMatrix *U, float *x, int *info)
+{
+
+#ifdef _CRAY
+    _fcd ftcs1, ftcs2, ftcs3;
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    float   *Lval, *Uval;
+    int incx = 1, incy = 1;
+    float alpha = 1.0, beta = 1.0;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    float *work;	/* dense scratch of length L->nrow; freed before every return */
+    flops_t solve_ops;
+    /*extern SuperLUStat_t SuperLUStat;*/
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U",1) !=0 ) *info = -1;
+    else if ( strncmp(trans, "N",1) !=0 && strncmp(trans, "T", 1) !=0 )
+	*info = -2;
+    else if ( strncmp(diag, "U", 1) !=0 && strncmp(diag, "N", 1) != 0 )
+	*info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+	i = -(*info);
+	xerr_dist("sp_strsv_dist", &i);
+	return 0;
+    }
+
+    Lstore = (SCformat *) L->Store;
+    Lval = (float *) Lstore->nzval;
+    Ustore = (NCformat *) U->Store;
+    Uval = (float *) Ustore->nzval;
+    solve_ops = 0;
+
+    /* Fixed: the abort message used to name sp_dtrsv_dist(), the double
+       precision routine, which is misleading in this float version. */
+    if ( !(work = floatCalloc_dist(L->nrow)) )
+	ABORT("Malloc fails for work in sp_strsv_dist().");
+    
+    if ( strncmp(trans, "N", 1)==0 ) {	/* Form x := inv(A)*x. */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L)*x */
+	    /* Fixed: free work[] on the quick-return paths below;
+	       previously each bare "return 0" leaked the allocation. */
+	    if ( L->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    /* Forward substitution, one supernode at a time. */
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+		fsupc = SuperLU_L_FST_SUPC(k);
+		istart = SuperLU_L_SUB_START(fsupc);
+		nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+		nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+		luptr = SuperLU_L_NZ_START(fsupc);
+		nrow = nsupr - nsupc;
+	        solve_ops += nsupc * (nsupc - 1);
+	        solve_ops += 2 * nrow * nsupc;
+		if ( nsupc == 1 ) {
+		    for (iptr=istart+1; iptr < SuperLU_L_SUB_START(fsupc+1); ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			++luptr;
+			x[irow] -= x[fsupc] * Lval[luptr];
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("L", strlen("L"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx, 1, 1, 1);
+		
+		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
+#endif /* _CRAY */		
+#else
+		    slsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		
+		    smatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+			&x[fsupc], &work[0] );
+#endif		
+		
+		    /* Scatter the dense GEMV result back into x and clear
+		       work[] for the next supernode. */
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; ++i, ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			x[irow] -= work[i];	/* Scatter */
+			work[i] = 0.0;
+		    }
+	 	}
+	    } /* for k ... */
+	    
+	} else {
+	    /* Form x := inv(U)*x */
+	    
+	    if ( U->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    /* Back substitution, supernodes in reverse order. */
+	    for (k = Lstore->nsuper; k >= 0; k--) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+    	        solve_ops += nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		    for (i = SuperLU_U_NZ_START(fsupc); i < SuperLU_U_NZ_START(fsupc+1); ++i) {
+			irow = SuperLU_U_SUB(i);
+			x[irow] -= x[fsupc] * Uval[i];
+		    }
+
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("U", strlen("U"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx);
+#else
+		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else		
+		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif		
+
+		    for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		        solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    	for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); 
+				i++) {
+			    irow = SuperLU_U_SUB(i);
+			    x[irow] -= x[jcol] * Uval[i];
+		    	}
+                    }
+		}
+	    } /* for k ... */
+	    
+	}
+    } else { /* Form x := inv(A')*x */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L')*x */
+	    if ( L->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = Lstore->nsuper; k >= 0; --k) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	istart = SuperLU_L_SUB_START(fsupc);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		solve_ops += 2 * (nsupr - nsupc) * nsupc;
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    iptr = istart + nsupc;
+		    for (i = SuperLU_L_NZ_START(jcol) + nsupc; 
+				i < SuperLU_L_NZ_START(jcol+1); i++) {
+			irow = SuperLU_L_SUB(iptr);
+			x[jcol] -= x[irow] * Lval[i];
+			iptr++;
+		    }
+		}
+		
+		if ( nsupc > 1 ) {
+		    solve_ops += nsupc * (nsupc - 1);
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#else
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    /* NOTE(review): this non-vendor path still calls the
+		       external BLAS strsv_; confirm a BLAS is linked even
+		       when USE_VENDOR_BLAS is undefined. */
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#endif
+		}
+	    }
+	} else {
+	    /* Form x := inv(U')*x */
+	    if ( U->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); i++) {
+			irow = SuperLU_U_SUB(i);
+			x[jcol] -= x[irow] * Uval[i];
+		    }
+		}
+
+		solve_ops += nsupc * (nsupc + 1);
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#endif
+		}
+	    } /* for k ... */
+	}
+    }
+
+    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
+    SUPERLU_FREE(work);
+    return 0;
+} /* sp_strsv_dist */
+
+
+/*! \brief SpGEMV
+
+  Purpose   
+    =======   
+
+    sp_sgemv_dist()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) double
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    X      - (input) float*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) float*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+
+*/ +int +sp_sgemv_dist(char *trans, float alpha, SuperMatrix *A, float *x, + int incx, float beta, float *y, int incy) +{ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int info; + float temp, temp1; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + float zero = 0.0; + float one = 1.0; + + notran = (strncmp(trans, "N", 1)==0); + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && strncmp(trans, "T", 1) !=0 && strncmp(trans, "C", 1) != 0) + info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerr_dist("sp_sgemv_dist ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.) + return 0; + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if ( strncmp(trans, "N", 1)==0 ) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if (beta != 1.) { + if (incy == 1) { + if (beta == 0.) + for (i = 0; i < leny; ++i) y[i] = zero; + else + for (i = 0; i < leny; ++i) y[i] = beta * y[i]; + } else { + iy = ky; + if (beta == 0.) + for (i = 0; i < leny; ++i) { + y[iy] = zero; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + y[iy] = beta * y[iy]; + iy += incy; + } + } + } + + if (alpha == 0.) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if (x[jx] != 0.) 
{ + temp = alpha * x[jx]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + y[irow] += temp * Aval[i]; + } + } + jx += incx; + } + } else { + ABORT("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = zero; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp += Aval[i] * x[irow]; + } + y[jy] += alpha * temp; + jy += incy; + } + } else { + ABORT("Not implemented."); + } + } + return 0; +} /* sp_sgemv_dist */ diff --git a/SRC/ssp_blas3_dist.c b/SRC/ssp_blas3_dist.c new file mode 100644 index 00000000..4bf00e01 --- /dev/null +++ b/SRC/ssp_blas3_dist.c @@ -0,0 +1,138 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */
+
+/*
+ * File name:		ssp_blas3_dist.c
+ * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+
+  Purpose   
+    =======   
+
+    sp_sgemm_dist performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
+             In the future, more general A can be handled.
+
+    B      - float array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, n ).  
+             Unchanged on exit.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - float array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*B + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.  
+
+*/ +int +sp_sgemm_dist(char *transa, int n, float alpha, SuperMatrix *A, + float *b, int ldb, float beta, + float *c, int ldc) +{ + + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_sgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/SRC/sstatic_schedule.c b/SRC/sstatic_schedule.c new file mode 100644 index 00000000..ebb416f5 --- /dev/null +++ b/SRC/sstatic_schedule.c @@ -0,0 +1,984 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs static scheduling for the look-ahead factorization algorithm. + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified: February 20, 2020, changed to be precision-dependent.
+ *
+ * Reference:
+ * 
+ * 
+ */ + +#include "superlu_sdefs.h" + +#ifdef ISORT +extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); +extern void isort1 (int_t N, int_t * ARRAY); + +#else + +static int +superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + +int +sstatic_schedule(superlu_dist_options_t * options, int m, int n, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, + int_t *perm_c_supno, int_t *iperm_c_supno, int *info) +{ +/* + * Arguments + * ========= + * + * perm_c_supno (output) + * perm_c_supno[k] = j means at the k-th step of elimination, the j-th + * supernode is chosen. + * iperm_c_supno (output), inverse of perm_c_supno[] + * iperm_c_supno[j] = k means the j-th supernode will be scheduled + * at the k-th step of elimination. + * + */ + int_t *xsup; + int_t i, ib, jb, lb, nlb, il, iu; + int_t Pc, Pr; + int iam, krow, yourcol, mycol, myrow; + int j, k, nsupers; /* k - current panel to work on */ + int_t *index; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + + int iword = sizeof (int_t); + + /* Test the input parameters. */ + *info = 0; + if (m < 0) *info = -2; + else if (n < 0) *info = -3; + if (*info) { + pxerr_dist ("static_schedule", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "before static schedule"); +#endif + + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if (options->lookahead_etree == YES && /* use e-tree of symmetrized matrix and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */ + + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if (options->ParSymbFact == NO) { + /* Use the etree computed from serial symb. fact., and turn it + into supernodal tree. */ + int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using column e-tree ===\n"); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers; + + for (j = 0, lb = 0; lb < nsupers; lb++) { + for (k = 0; k < SuperSize (lb); k++) { + jb = Glu_persist->supno[etree[j + k]]; + if (jb != lb) + etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb); + } + j += SuperSize (lb); + } + } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). 
*/ +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using supernodal e-tree ===\n"); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers; + for (lb = 0; lb < ncb; lb++) { + jb = lb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for (j = 0; j < i; j++) + { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for (j = 0; j < i; j++) { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + + MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t, + MPI_MIN, grid->comm); + + SUPERLU_FREE (etree_supno_l); + } + + /* initialize number of children for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) + if (etree_supno[i] != nsupers) num_child[etree_supno[i]]++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; 
i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + ptr = head; + j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + if (etree_supno[j] != nsupers) { + num_child[etree_supno[j]]--; + if (num_child[etree_supno[j]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + SUPERLU_FREE (num_child); + SUPERLU_FREE (etree_supno); + log_memory(-2 * nsupers * iword, stat); + + } else { /* Unsymmetric pattern */ + + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very naive implementation) */ + int nrbp1 = nrb + 1; + float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes; + + /* allocate some workspace */ + if (! 
(sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int)))) + ABORT ("Malloc fails for sendcnts[]."); + log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat); + + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls[Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls[Pr * Pc]; + rrows = &srows[Pr * Pc * nrbp1]; + + myrow = MYROW (iam, grid); +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using DAG ===\n"); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + lb]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + nrb]++; + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if (!(blocks = intMalloc_dist (nblocks))) + ABORT ("Malloc fails for blocks[]."); + log_memory( nblocks * iword, stat ); + + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % 
Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* communication */ + MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm); + + log_memory( -(nblocks * iword), stat ); /* blocks[] to be freed soon */ + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + nblocks += recvcnts[p]; + } + + if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[]."); + log_memory( nblocks * iword, stat ); + + MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, + rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (blocks); /* memory logged before */ + + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[]."); + if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[]."); + + Ublock_bytes = nblocks * iword; + Urows_bytes = (1 + nlb) * iword; + log_memory( Ublock_bytes + Urows_bytes, stat ); + + k = 0; + for (jb = 0; jb < nlb; jb++) { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for (i = rdispls[p]; 
i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[jb]++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1 (Urows[jb], &(Ublock[k - Urows[jb]])); +#else + qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; + for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[nlb]++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]])); +#else + qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + SUPERLU_FREE (blockr); + log_memory( -nblocks * iword, stat ); + + /* sort the block in L-factor */ + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + + if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[]."); + if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[]."); + + Lblock_bytes = nblocks * iword; + Lrows_bytes = (1 + ncb) * iword; + log_memory(Lblock_bytes + Lrows_bytes, stat); + + for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0; + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k 
+ 1]; + i--; + } + + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[lb]++; + nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]])); +#else + qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[ncb]++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]])); +#else + qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[]."); + if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[]."); + + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) + sf_block_l[lb] = nsupers; + i = 0; + j = 0; + for (jb = 0; jb < nlb; jb++) { + if (Urows[jb] > 0) { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for (k = 0; k < Lrows[jb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } else { + j++; + } + } + i = ib; + } else { + j += Lrows[jb]; + } + } + if (mycol < nsupers % grid->npcol) { + if (Urows[nlb] > 0) { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for (k = 0; k < Lrows[nlb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) + { + sf_block_l[lb] = Lblock[j]; + j += 
(Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + j++; + } + } + i = ib; + } else { + j += Lrows[nlb]; + } + } + + /* compute the first global symmetric matchs */ + MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, + grid->comm); + SUPERLU_FREE (sf_block_l); + log_memory( -nsupers * iword, stat ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[]."); + if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[]."); + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + 
for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + if (mycol == pc) { + /* from U-factor */ + ib = iu + Urows[j]; + for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, + grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * iword, stat); + + k = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + SUPERLU_FREE (recvbuf); + log_memory(-nblocks * iword, stat); + +#else /* not USE_ALLGATHER */ + int nlsupers = nsupers / Pc; + if 
(mycol < nsupers % Pc) nlsupers++; + + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nlsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (mycol == pc) { + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + /* from U-factor */ + ib = iu + Urows[j]; + for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } else if (nnodes_l[lb] + nnodes_u[lb] != 0) + printf (" # %d: nnodes[" IFMT "]=" IFMT "+" IFMT "\n", + grid->iam, lb, nnodes_l[lb], nnodes_u[lb]); + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * sizeof(int_t *), stat); + + k = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + 
} + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) + { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + } + SUPERLU_FREE (recvbuf); + log_memory( -nblocks * iword , stat); + +#endif /* end USE_ALL_GATHER */ + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) { + for (jb = 0; jb < nnodes_l[i]; jb++) { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + /*printf( "=== pop %d (%d) ===\n",head->id,i ); */ + ptr = head; + j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + for (jb = 0; jb < nnodes_l[j]; jb++) { + num_child[edag_supno[j][jb]]--; + if (num_child[edag_supno[j][jb]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + for (lb = 0; lb < nsupers; lb++) + if (nnodes_l[lb] > 0) SUPERLU_FREE 
(edag_supno[lb]); + + SUPERLU_FREE (num_child); + SUPERLU_FREE (edag_supno); + SUPERLU_FREE (nnodes_l); + SUPERLU_FREE (sf_block); + SUPERLU_FREE (sendcnts); + + log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat); + + SUPERLU_FREE (Ublock); + SUPERLU_FREE (Urows); + SUPERLU_FREE (Lblock); + SUPERLU_FREE (Lrows); + log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "after static schedule"); + check_perm_dist("perm_c_supno", nsupers, perm_c_supno); + check_perm_dist("iperm_c_supno", nsupers, iperm_c_supno); +#endif + + return 0; +} /* STATIC_SCHEDULE */ + diff --git a/SRC/ssuperlu_blas.c b/SRC/ssuperlu_blas.c new file mode 100644 index 00000000..5d820665 --- /dev/null +++ b/SRC/ssuperlu_blas.c @@ -0,0 +1,123 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Wrapper functions to call BLAS. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_sdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_sgemm(const char *transa, const char *transb,
+                  int m, int n, int k, float alpha, float *a,
+                  int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    sgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return sgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+int superlu_strsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const float alpha, const float *a,
+                  const int lda, float *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    strsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return strsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_sger(const int m, const int n, const float alpha,
+                 const float *x, const int incx, const float *y,
+                 const int incy, float *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    sger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_sscal(const int n, const float alpha, float *x, const int incx)
+{
+    sscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_saxpy(const int n, const float alpha,
+    const float *x, const int incx, float *y, const int incy)
+{
+    saxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_sgemv(const char *trans, const int m,
+                  const int n, const float alpha, const float *a,
+                  const int lda, const float *x, const int incx,
+                  const float beta, float *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_strsv(char *uplo, char *trans, char *diag,
+                  int n, float *a, int lda, float *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
new file mode 100644
index 00000000..dbdabe9d
--- /dev/null
+++ b/SRC/ssuperlu_gpu.cu
@@ -0,0 +1,1780 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "superlu_defs.h" + +#undef Reduce + +//#include + +#include "slustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + float *nzval, int_t ldv, + float *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void sdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + float *nzval, int ldv, + const float *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + float * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + float * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + sLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + float *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + float *LnzvalVec = A_gpu->LnzvalVec; + float *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int THREAD_BLOCK_SIZE =ldt; + + int* pfxStorage = 
(int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + float *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + float *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + float *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if (thread_id < 
blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + float *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + sdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + 
nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + float * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + float * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("sSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(float), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(float), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(float) + + bigu_send_size * sizeof(float) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + float alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + 
int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. 
*/ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + gpublasSgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end sSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(gpuDeviceProp devProp) +{ + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu) +{ + /* Free the L data structure on GPU */ + checkGPU(gpuFree(A_gpu->LrowindVec)); + checkGPU(gpuFree(A_gpu->LrowindPtr)); + + checkGPU(gpuFree(A_gpu->LnzvalVec)); + checkGPU(gpuFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + + /*freeing the pinned memory*/ + int_t streamId = 0; + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkGPU(gpuFreeHost(A_gpu->acc_L_buff)); + checkGPU(gpuFreeHost(A_gpu->acc_U_buff)); + checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + 
checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE + SUPERLU_FREE(A_gpu->GemmStart); + SUPERLU_FREE(A_gpu->GemmEnd); + SUPERLU_FREE(A_gpu->ScatterEnd); + SUPERLU_FREE(A_gpu->ePCIeH2D); + SUPERLU_FREE(A_gpu->ePCIeD2H_Start); + SUPERLU_FREE(A_gpu->ePCIeD2H_End); + + /* Free the U data structure on GPU */ + checkGPU(gpuFree(A_gpu->UrowindVec)); + checkGPU(gpuFree(A_gpu->UrowindPtr)); + + //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated + + checkGPU(gpuFree(A_gpu->UnzvalVec)); + checkGPU(gpuFree(A_gpu->UnzvalPtr)); + + checkGPU(gpuFree(A_gpu->grid)); + + /* Free the Schur complement structure on GPU */ + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkGPU(gpuFree(A_gpu->indirect)); + // checkGPU(gpuFree(A_gpu->indirect2)); + checkGPU(gpuFree(A_gpu->xsup)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].usub)); + + checkGPU(gpuFree(A_gpu->local_l_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->jib_lookupVec)); + checkGPU(gpuFree(A_gpu->jib_lookupPtr)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->ijb_lookupVec)); + checkGPU(gpuFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void sPrint_matrix( char *desc, int_t m, int_t n, float * dA, int_t lda ) +{ + float *cPtr = (float *) malloc(sizeof(float) * lda * n); + checkGPU(gpuMemcpy( cPtr, dA, + lda * n * sizeof(float), gpuMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" ); + 
} + free(cPtr); +} + +void sprintGPUStats(sLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + +/* Initialize the GPU side of the data structure. 
*/ +int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, // LU structures on GPU, see slustruct_gpu.h + sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + checkGPUErrors(gpuDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nGPUStreams = getnGPUStreams(); + + int SCATTER_THREAD_BLOCK_SIZE = ldt; + if(getenv("SCATTER_THREAD_BLOCK_SIZE")) + { + int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE")); + if(stbs>=ldt) + { + SCATTER_THREAD_BLOCK_SIZE = stbs; + } + + } + + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams); + fflush(stdout); + printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE); + + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + gpuStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + gpuStreamCreate(&(sluGPU->funCallStreams[streamId])); + gpublasCreate(&(sluGPU->gpublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (sLUstruct_gpu_t *) malloc (sizeof(sLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. 
*/ + sCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end sinitSluGPU3D_t */ + + +int sinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + ssluGPU_t *sluGPU, gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && 
Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* sinitD2Hreduce */ + +int sreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + sLUstruct_t *LUstruct +) +{ + sLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + float** Unzval_br_ptr = Llu->Unzval_br_ptr; + + gpuStream_t CopyStream; + sLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = 
d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + float *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + float alpha = 1; + superlu_saxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + float *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + float alpha = 1; + superlu_saxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* sreduceGPUlu */ + + +int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int ssendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + gpuStream_t CopyStream = sluGPU->CopyStream;; + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(float), gpuMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, 
&A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(float), gpuMemcpyDeviceToHost, CopyStream ) ); + gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(float) + l_copy_len * sizeof(float); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int ssendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + float* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + ssluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] ssendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in ssendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(float)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(float)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(ssluGPU_t *sluGPU) +{ + return 0; +} +#endif + +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling sfree_LUsstruct_gpu(). 
*/ +void sCopyLUToGPU3D ( + int* isNodeInMyGrid, + sLocalLU_t *A_host, /* distributed LU structure on host */ + ssluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(float) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(float) )) ; + A_gpu->scubufs[streamId].bigU_host = (float *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(float) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (float *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(float) )) ; + A_gpu->scubufs[streamId].bigU = (float *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + 
A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(float))) ; + A_gpu->scubufs[streamId].bigV = (float *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + 
/*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + 
checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + 
+ checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(float) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + 
int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(float) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(float) * 1e-9, + u_val_len * sizeof(float) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(float))); + A_gpu->LnzvalVec = (float *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(float))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( 
&tmp_ptr, u_val_len * sizeof(float))); + A_gpu->UnzvalVec = (float *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(float))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(sLUstruct_gpu_t))) ; + *dA_gpu = (sLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(sLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end sCopyLUToGPU3D */ + + + +int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nGPUStreams; 
streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + sinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + sreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + sreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* sreduceAllAncestors3d_GPU */ + + +void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/ssuperlu_gpu.hip.cpp b/SRC/ssuperlu_gpu.hip.cpp new file mode 100644 index 00000000..e474ab12 --- /dev/null +++ b/SRC/ssuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "ssuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/streeFactorization.c b/SRC/streeFactorization.c new file mode 100644 index 00000000..5ad8adc7 --- /dev/null +++ b/SRC/streeFactorization.c @@ -0,0 +1,763 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+/*! \brief Allocate the four L/U subscript and value exchange buffers in LUvsb.
+ *
+ * Buffer lengths come from Llu->bufmax[0..3], the maximum subscript/value
+ * sizes recorded in the local LU data structure.  Always returns 0.
+ * NOTE(review): allocation failure (NULL return from intMalloc_dist /
+ * floatMalloc_dist) is not checked here -- confirm callers tolerate NULL.
+ */
+int_t sLluBufInit(sLUValSubBuf_t* LUvsb, sLUstruct_t *LUstruct)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
+    LUvsb->Lval_buf = floatMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]);
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
+    LUvsb->Uval_buf = floatMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]);
+    return 0;
+}
+
+/*! \brief Allocate an array of mxLeafNode diagonal-factorization buffer
+ *  structs; each element is allocated and initialized via
+ *  sinitDiagFactBufs(ldt, ...).
+ *
+ *  mxLeafNode can be 0 for the replicated layers of the processes; in that
+ *  case nothing is allocated and NULL is returned.  (Bug fix: previously
+ *  the uninitialized local pointer was returned when mxLeafNode == 0.)
+ *  The 'grid' argument is unused here but kept for interface consistency
+ *  with the other *InitArr routines.  Free with sfreeDiagFactBufsArr().
+ */
+sdiagFactBufs_t** sinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    sdiagFactBufs_t** dFBufs = NULL;  /* stays NULL when mxLeafNode == 0 */
+
+    if ( mxLeafNode ) dFBufs = (sdiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(sdiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        dFBufs[i] = (sdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(sdiagFactBufs_t));
+        assert(dFBufs[i]);                 /* abort if the element allocation failed */
+        sinitDiagFactBufs(ldt, dFBufs[i]); /* allocate the ldt*ldt L and U factor blocks */
+    }
+
+    return dFBufs;
+}
+
+// sherry added
+int sfreeDiagFactBufsArr(int_t mxLeafNode, sdiagFactBufs_t** dFBufs)
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+	SUPERLU_FREE(dFBufs[i]->BlockUFactor);
+	SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+	SUPERLU_FREE(dFBufs[i]);
+    }
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs);
+
+    return 0;
+}
+
+/*! \brief Allocate an array of numLA look-ahead L/U exchange buffer structs,
+ *  each initialized with sLluBufInit() (sizes taken from LUstruct->Llu->bufmax).
+ *  Free with sLluBufFreeArr().
+ *  NOTE(review): neither the array nor the element allocations are
+ *  NULL-checked here.
+ */
+sLUValSubBuf_t** sLluBufInitArr(int_t numLA, sLUstruct_t *LUstruct)
+{
+    sLUValSubBuf_t** LUvsbs = (sLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(sLUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* code */
+        LUvsbs[i] = (sLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+        sLluBufInit(LUvsbs[i], LUstruct);
+    } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
+
+    return LUvsbs;
+}
+
+// sherry added
+int sLluBufFreeArr(int_t numLA, sLUValSubBuf_t **LUvsbs)
+{
+    for (int_t i = 0; i < numLA; ++i) {
+	SUPERLU_FREE(LUvsbs[i]->Lsub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+	SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+	SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+/*! \brief Allocate the Schur-complement-update work buffers:
+ *  bigV (sized via sgetBigV from ldt and num_threads) and
+ *  bigU (sized via sgetBigU from nsupers, grid, LUstruct).
+ *  Always returns 0.  Free with sfreeScuBufs().
+ */
+int_t sinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                  sscuBufs_t* scuBufs,
+                  sLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    scuBufs->bigV = sgetBigV(ldt, num_threads);
+    scuBufs->bigU = sgetBigU(nsupers, grid, LUstruct);
+    return 0;
+}
+
+// sherry added
+int sfreeScuBufs(sscuBufs_t* scuBufs)
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
+
+/*! \brief Allocate the ldt*ldt single-precision U and L diagonal-factor
+ *  blocks of one sdiagFactBufs_t.  Always returns 0.
+ *  NOTE(review): allocations are not NULL-checked here; the array wrapper
+ *  sinitDiagFactBufsArr() asserts only on the struct allocation.
+ */
+int_t sinitDiagFactBufs(int_t ldt, sdiagFactBufs_t* dFBuf)
+{
+    dFBuf->BlockUFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    dFBuf->BlockLFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    return 0;
+}
+
+/*! \brief 2D factorization of one subtree, processing supernodes strictly
+ *  in the order given by perm_c_supno (no look-ahead pipelining).
+ *
+ *  For each supernode k: diagonal factorization + diagonal-block broadcast
+ *  (sDiagFactIBCast), L panel update + Ibcast, U panel update + Ibcast,
+ *  waits for the incoming L and U panels, then sets up and applies the
+ *  Schur-complement update over all (U-block, L-block) pairs via
+ *  sblock_gemm_scatter -- in parallel when compiled with OpenMP.
+ *  Schur-update time is accumulated into SCT->NetSchurUpTimer; *info is
+ *  forwarded to sDiagFactIBCast for its status reporting.  Returns 0.
+ *  NOTE(review): the #if 0 branches keep an older wrapper-based calling
+ *  convention for reference; the #else branches are the live code.
+ */
+int_t sdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    sscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    sLUValSubBuf_t* LUvsb,
+    sdiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	sIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	sUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	sIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	/* Block until the L and U panels of supernode k have arrived;
+	 * required before the Schur-complement update below may read them. */
+	sWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	sWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT);
+#endif
+        double tsch = SuperLU_timer_();
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= sSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            float* bigV = scuBufs->bigV;
+            float* bigU = scuBufs->bigU;
+
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij)
+            {
+                /* code */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                float** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                /* decode the flat index into a (U-block, L-block) pair */
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                float *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                float *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                sblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT);
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* sdenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ */
+int_t ssparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    sscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    sLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    sdiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    /* Pipelined (look-ahead) sparse LU factorization of one elimination
+     * subtree on the 2D process grid -- CPU-only path.  Supernodes are
+     * factored level by level in the subtree's topological order; panel
+     * broadcasts for up to numLA future supernodes are kept in flight so
+     * communication overlaps the Schur-complement update.  On success *info
+     * is 0 and the return value is 0. */
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        /* Empty subtree: nothing to factor.  NOTE(review): returns 1, not 0 --
+         * confirm callers treat a nonzero return as "no work done", not as an
+         * error code. */
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter ssparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    /* Per-supernode progress flags shared between the phases below:
+     *   factored_D/L/U  -- diagonal / L-panel / U-panel work completed;
+     *   IbcastPanel_L/U -- non-blocking panel broadcast already posted;
+     *   IrecvPlcd_D     -- Irecv for the diagonal block already placed. */
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    /* Depth of the look-ahead window: how many panel broadcasts may be in
+     * flight at once (also the cycle length for the msgss/LUvsbs buffers). */
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    /* Phase 0: eagerly factor and Ibcast the diagonal block of every
+     * supernode in topological level 0 (the leaves of this subtree);
+     * eTreeTopLims[1] is one past the last level-0 node. */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        /* Pass 1: diagonal factorization for any node of this level not
+         * already handled by phase 0 or by look-ahead from the level below. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        /* Pass 2: panel triangular solves -- produce the L panel (using the
+         * received U diagonal factor) and the U panel (using the L diagonal
+         * factor) of each node in this level, if not already done. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        /* Pass 3: prime the pipeline -- post L/U panel broadcasts for the
+         * first numLA nodes of this level.  Comm buffers are reused
+         * cyclically, hence the k0 % numLA indexing (unlike the per-level
+         * k0 - k_st indexing of passes 1-2). */
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        /* Main pass: for each node of this level -- wait for its L/U panels,
+         * perform the Schur-complement update, then keep the pipeline full by
+         * scheduling future broadcasts and look-ahead work for the next
+         * level. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = sSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            /* jj_cpu == 0: in this CPU-only path the entire num_u_blks_Phi
+             * region is processed on the host (no GPU offload partition). */
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            float* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Look-ahead L rows X the Phi (right) portion of U. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Remaining L rows X the left portion of U. */
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            /* Etree bookkeeping: this node's contribution is in; decrement
+             * the parent's outstanding-children count.  When it hits zero the
+             * parent's diagonal can be factored early (look-ahead into the
+             * next topological level). */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        /* Parent's position within the NEXT level, used to
+                         * index the per-level dFBufs/comReqss.  NOTE(review):
+                         * assumes k0_parent >= k_end, i.e. myIperm orders
+                         * parents after the current level -- confirm. */
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Remaining L rows X the Phi portion of U, past jj_cpu
+                 * (jj_cpu is 0 here, so the whole Phi region). */
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            /* abs_offset: position within this level (matches the comReqss
+             * indexing used when the diagonal Ibcast was posted). */
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            /* Keep up to numLA panel broadcasts in flight: post L/U Ibcasts
+             * for upcoming nodes whose panels are already factored. */
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            /* Look-ahead: try to advance nodes of the NEXT level whose
+             * diagonal block has already arrived (panel solves + early
+             * broadcasts if they fall inside the look-ahead window). */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    sLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                /* offset1: cyclic comm-buffer slot for the
+                                 * early broadcast. */
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    sUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                /* NOTE(review): this 'offset' shadows the
+                                 * per-level offset above (the L branch names
+                                 * its slot 'offset1'); intentional but easy
+                                 * to misread -- cyclic comm-buffer index. */
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit ssparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* ssparseTreeFactor_ASYNC */
diff --git a/SRC/streeFactorizationGPU.c b/SRC/streeFactorizationGPU.c
new file mode 100644
index 00000000..b136ebbf
--- /dev/null
+++ b/SRC/streeFactorizationGPU.c
@@ -0,0 +1,759 @@
+
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "slustruct_gpu.h" + +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int ssparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + sscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + sLUValSubBuf_t **LUvsbs, // size = num Look ahead + sdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + ssluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, 
+ int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + ssyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + sinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + sinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = sSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + float *bigV = scuBufs->bigV; + float *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + sblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + sblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->RemainBlk * 
HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + sblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + sinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + sreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + swaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + ssendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + sSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + sblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + sLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + sUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end ssparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/strfAux.c b/SRC/strfAux.c new file mode 100644 index 00000000..6a26e5e6 --- /dev/null +++ b/SRC/strfAux.c @@ -0,0 +1,758 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure that assists halo offload of the Schur complement. */
+/*! \brief Initialize the HyP_t bookkeeping structure used for the
+ *         Schur-complement (halo) offload.
+ *
+ * Allocates the look-ahead / remaining block-info arrays and the L/U
+ * "dirty bit" arrays, and resets the offload state.
+ *
+ * \param HyP  structure to initialize (arrays allocated here)
+ * \param Llu  local LU data; Llu->bufmax[1] sizes the L value buffers
+ * \param mcb  number of column blocks (sizes Ublock_info*, Lblock_dirty_bit)
+ * \param mrb  number of row blocks (sizes *_info, Ublock_dirty_bit)
+ *
+ * NOTE(review): allocation results are not checked for NULL, consistent
+ * with the surrounding codebase style.
+ */
+void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;  /* no offload performed yet */
+#if 0
+    /* Alternative: 64-byte aligned allocations for vectorized kernels. */
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    /* -1 marks every block as "clean" (never touched by an offload). */
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    /* (removed a redundant second assignment of HyP->last_offload = -1;
+       it is already set at the top of this routine) */
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nGPUStreams = 0;
+} /* sInit_HyP */
+
+/* Initialize the 3D LU structure through the forest interface
+   (wrapper over the node-count/node-list based sinit3DLUstruct). */
+void sinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, sLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    /* A complete binary tree over the Z process dimension has
+       2^maxLvl - 1 forests in total. */
+    int_t nLevels  = log2i(grid3d->zscp.Np) + 1;
+    int_t nForests = (1 << nLevels) - 1;
+
+    /* Flatten the forest array into per-forest node counts and node lists,
+       the representation expected by the older interface. */
+    int_t*  nodeCounts = INT_T_ALLOC (nForests);
+    int_t** nodeLists  =  (int_t**) SUPERLU_MALLOC(nForests * sizeof(int_t*));
+
+    for (int f = 0; f < nForests; ++f)
+	{
+	    if (sForests[f])
+		{
+		    nodeCounts[f] = sForests[f]->nNodes;
+		    nodeLists[f]  = sForests[f]->nodeList;
+		}
+	    else
+		{
+		    /* Empty slot in the forest array. */
+		    nodeCounts[f] = 0;
+		    nodeLists[f]  = NULL;
+		}
+	}
+
+    /* Delegate to the original count/list based initializer. */
+    sinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     nodeCounts, nodeLists, LUstruct, grid3d);
+
+    /* Only the flattened wrappers are freed; the node lists themselves
+       are owned by the forests. */
+    SUPERLU_FREE(nodeCounts);
+    SUPERLU_FREE(nodeLists);
+}
+
+/*! \brief Set up the data structures for the Schur-complement update of
+ *         panel k: locate the L(:,k) and U(k,:) panels (local or received),
+ *         build Ublock_info / Remain_info metadata, and gather U into bigU.
+ *
+ *  Returns nonzero (LU_nonempty) if both L(:,k) and U(k,:) are non-empty
+ *  (msgcnt[0] && msgcnt[2]); returns 0 otherwise, doing no setup work.
+ */
+int_t sSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    float *bigU,
+    int_t* Lsub_buf,
+    float *Lval_buf,
+    int_t* Usub_buf,
+    float *Uval_buf,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Owner row/column of panel k in the 2D process grid. */
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    /* Pick up L(:,k): from local storage if this process column owns it,
+       otherwise from the receive buffers. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    /* Pick up U(k,:) analogously, from local storage or receive buffers. */
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+        /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval stay
+           uninitialized; presumably msgcnt guarantees that combination never
+           reaches the code below — TODO confirm against the caller. */
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];   /* nonzero iff L(:,k) is non-empty */
+    int_t msg2 = msgcnt[2];   /* nonzero iff U(k,:) is non-empty */
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;   /* nothing to update */
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];  /* leading dimension of L panel */
+        int_t nlb;                    /* number of off-diagonal L blocks */
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        /* Build the permutation keys: elimination order of each U block. */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        /* perm_u holds (key, index) pairs; sort pairs by key. */
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        int_t ldu   = 0;          /* max segment size across all U blocks */
+        int_t full  = 1;          /* 1 while all segments share the same size */
+        int_t num_u_blks = 0;
+
+        /* Record per-block metadata (iukp/rukp offsets, block id,
+           elimination order, nonzero column count) for each U block. */
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;   /* note: shadows the outer iukp */
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            /* Count columns with a nonzero segment in this block. */
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            /* Track the largest segment size (ldu) for the GEMM call. */
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        /* Prefix sums: full_u_cols = cumulative columns, StCol = start column. */
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        /* Pack the U panel densely into bigU for the GEMM call. */
+        sgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        /* Record metadata for the off-diagonal L blocks ("remaining" rows). */
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* sSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+int_t sSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    sscuBufs_t* scuBufs, sLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, sLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    float * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    float* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    if (mycol == kcol)
+    {
+        /* Column k is local: point at the owned L panel storage. */
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)  /* U(k,:) was received into the buffers. */
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* Now each process performs its portion of the Schur-complement update. */
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        sRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            sRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* sSchurComplementSetupGPU */
+
+
+float* sgetBigV(int_t ldt, int_t num_threads)
+{
+    float *bigV;  /* scratch buffer: 8 * ldt * ldt floats per thread */
+    if (!(bigV = floatMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for dgemm buffV");
+    return bigV;
+}
+
+float* sgetBigU(int_t nsupers, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Find the maximum row dimension (lsub[1]) among my local L panels. */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Reduce to the global max row size across the process row. */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /*Buffer size is max of look-ahead window*/
+
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    float* bigU = floatMalloc_dist(bigu_size);
+
+    return bigU;
+} /* sgetBigU */
+
+
+trf3Dpartition_t* sinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      sLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter sinitTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* initially every child of node i is still unprocessed */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    sinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    sLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+    sLluBufInit(LUvsb, LUstruct);
+
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* walk the nodes of forest Fr */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* record which forest owns this supernode */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit sinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* sinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void sDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter sDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1; /* complete binary tree of forests */
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit sDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* empty block row: nothing stored locally */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /* initialize perm_u to the identity */
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+    /* NOTE(review): dead (#if 0) code -- this call passes a 6th arg &ldu, but num_full_cols_U above takes only 5; the live versions are in util.c. */
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/strfCommWrapper.c b/SRC/strfCommWrapper.c
new file mode 100644
index 00000000..de9cc01f
--- /dev/null
+++ b/SRC/strfCommWrapper.c
@@ -0,0 +1,548 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+int_t sDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     float *BlockUFactor,
+                     float *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    // xsup[] is needed by SuperSize() below
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        sIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        sIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /* Factor the diagonal block on its owner and broadcast it along the
+       process row and the process column. */
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Sgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U overwrites this buffer*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        sPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /* Isend U block to the process row; NOTE(review): buffer passed is BlockLFactor -- confirm intended. */
+        int_t nsupc = SuperSize(k);
+        sISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        sISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+int_t sLPanelTrSolve( int_t k,   int_t* factored_L,
+		      float* BlockUFactor,
+		      gridinfo_t *grid,
+		      sLUstruct_t *LUstruct)
+{
+    double alpha = 1.0;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /* Solve the L panel on processes in column kcol other than the owner. */
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /* NOTE(review): recv wait below is commented out; caller must guarantee the U diagonal block has arrived. */
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /* Triangular solve of the L panel rows below the diagonal block. */
+
+        int_t l = nsupr - nsupc;
+
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* sLPanelTrSolve */
+
+int_t sLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    float* BlockUFactor,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    /* Wait for the U diagonal block to arrive, then solve the L panel. */
+    sUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    sLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+}  /* sLPanelUpdate */
+
+#define BL  32
+
+int_t sUPanelTrSolve( int_t k,  
+                     float* BlockLFactor,
+                     float* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /* Solve the U panel on non-owner processes in row krow, using the received L factor. */
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        float *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        float *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                float *tempv = bigV +  thread_id * ldt * ldt;
+                sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /* Solve the U panel on the owner process, using the local L panel. */
+    if (iam == pkk)
+    {
+        /* owner path */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        float *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            float *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            float *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    float *tempv = bigV +  thread_id * ldt * ldt;
+                    sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* sUPanelTrSolve */
+
+int_t sUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    float* BlockLFactor,
+                    float* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+    /* Wait for the L diagonal block to arrive, then solve the U panel. */
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+
+    sUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
+    return 0;
+}
+
+int_t sIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    float* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* Broadcast (owner) or post a receive (non-owner) for L(:,k). */
+    if (mycol == kcol)
+    {
+        /* I own column k: broadcast L(:,k) across my process row. */
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        float* lusup = Lnzval_bc_ptr[lk];
+
+        sIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*place Irecv*/
+            sIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+int_t sIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, float* Uval_buf,
+    			     gridinfo_t *grid, sLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /* I own row k: broadcast U(k,:) down my process column. */
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        float* uval = Unzval_br_ptr[lk];
+        sIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* account the broadcast communication volume */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            sIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /* Sender side: wait for the L-panel Isends to complete. */
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*force wait for I recv to complete*/
+            sWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)   /* sender side */
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            sWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/superlu_FCnames.h b/SRC/superlu_FCnames.h
index 9bd95e2a..97b711a0 100644
--- a/SRC/superlu_FCnames.h
+++ b/SRC/superlu_FCnames.h
@@ -24,49 +24,75 @@ at the top-level directory.
 #define __SUPERLU_FCNAMES
 
 /* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE)
-#define f_create_options_handle          FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE)
-#define f_create_ScalePerm_handle        FC_GLOBAL(f_create_scaleperm_handle,F_CREATE_SCALEPERM_HANDLE)
-#define f_create_LUstruct_handle         FC_GLOBAL(f_create_lustruct_handle,F_CREATE_LUSTRUCT_HANDLE)
-#define f_create_SOLVEstruct_handle      FC_GLOBAL(f_create_solvestruct_handle,F_CREATE_SOLVESTRUCT_HANDLE)
-#define f_create_SuperMatrix_handle      FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE)
-#define f_destroy_gridinfo_handle        FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE)
-#define f_destroy_options_handle         FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE)
-#define f_destroy_ScalePerm_handle       FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE)
-#define f_destroy_LUstruct_handle        FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE)
-#define f_destroy_SOLVEstruct_handle     FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE)
-#define f_destroy_SuperMatrix_handle     FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE)
-#define f_create_SuperLUStat_handle      FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE)
-#define f_destroy_SuperLUStat_handle     FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE)
-#define f_get_gridinfo                   FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO)
-#define f_get_SuperMatrix                FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX)
-#define f_set_SuperMatrix                FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX)
-#define f_get_CompRowLoc_Matrix          FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX)
-#define f_set_CompRowLoc_Matrix          FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX)
-#define f_get_superlu_options            FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS)
-#define f_set_superlu_options            FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS)
-#define f_set_default_options            FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS)
-#define f_superlu_gridinit               FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT)
-#define f_superlu_gridmap                FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP)
-#define f_superlu_gridexit               FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT)
-#define f_ScalePermstructInit            FC_GLOBAL(f_scalepermstructinit,F_SCALEPERMSTRUCTINIT)
-#define f_ScalePermstructFree            FC_GLOBAL(f_scalepermstructfree,F_SCALEPERMSTRUCTFREE)
-#define f_PStatInit                      FC_GLOBAL(f_pstatinit,F_PSTATINIT)
-#define f_PStatFree                      FC_GLOBAL(f_pstatfree,F_PSTATFREE)
-#define f_LUstructInit                   FC_GLOBAL(f_lustructinit,F_LUSTRUCTINIT)
-#define f_LUstructFree                   FC_GLOBAL(f_lustructfree,F_LUSTRUCTFREE)
-#define f_Destroy_LU                     FC_GLOBAL(f_destroy_lu,F_DESTROY_LU)
-#define f_dCreate_CompRowLoc_Mat_dist    FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST)
-#define f_zCreate_CompRowLoc_Mat_dist    FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST)
-#define f_Destroy_CompRowLoc_Mat_dist    FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST)
-#define f_Destroy_SuperMat_Store_dist    FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST)
-#define f_dSolveFinalize                 FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE)
-#define f_zSolveFinalize                 FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE)
-#define f_pdgssvx                        FC_GLOBAL(f_pdgssvx,F_PDGSSVX)
-#define f_pzgssvx                        FC_GLOBAL(f_pzgssvx,F_PZGSSVX)
-#define f_dcreate_dist_matrix            FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX)
-#define f_zcreate_dist_matrix            FC_GLOBAL(f_zcreate_dist_matrix,F_ZCREATE_DIST_MATRIX)
-#define f_check_malloc                   FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC)
+#define f_create_gridinfo_handle       FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE)
+#define f_create_gridinfo3d_handle     FC_GLOBAL(f_create_gridinfo3d_handle,F_CREATE_GRIDINFO3D_HANDLE)
+#define f_create_options_handle        FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE)
+#define f_create_SuperMatrix_handle    FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE)
+#define f_destroy_gridinfo_handle      FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE)
+#define f_destroy_options_handle       FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE)
+#define f_destroy_ScalePerm_handle     FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE)
+#define f_destroy_LUstruct_handle      FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE)
+#define f_destroy_SOLVEstruct_handle   FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE)
+#define f_destroy_SuperMatrix_handle   FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE)
+#define f_create_SuperLUStat_handle    FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE)
+#define f_destroy_SuperLUStat_handle   FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE)
+#define f_get_gridinfo                 FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO)
+#define f_get_gridinfo3d               FC_GLOBAL(f_get_gridinfo3d,F_GET_GRIDINFO3D)
+#define f_get_SuperMatrix              FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX)
+#define f_set_SuperMatrix              FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX)
+#define f_get_CompRowLoc_Matrix        FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX)
+#define f_set_CompRowLoc_Matrix        FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX)
+#define f_get_superlu_options          FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS)
+#define f_set_superlu_options          FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS)
+#define f_set_default_options          FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS)
+#define f_superlu_gridinit             FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT)
+#define f_superlu_gridinit3d           FC_GLOBAL(f_superlu_gridinit3d,F_SUPERLU_GRIDINIT3D)
+#define f_superlu_gridmap              FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP)
+#define f_superlu_gridexit             FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT)
+#define f_PStatInit                    FC_GLOBAL(f_pstatinit,F_PSTATINIT)
+#define f_PStatFree                    FC_GLOBAL(f_pstatfree,F_PSTATFREE)
+#define f_Destroy_CompRowLoc_Mat_dist  FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST)
+#define f_Destroy_SuperMat_Store_dist  FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST)
+#define f_check_malloc                 FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC)
+
+////// double
+#define f_dcreate_ScalePerm_handle     FC_GLOBAL(f_dcreate_scaleperm_handle,F_DCREATE_SCALEPERM_HANDLE)
+#define f_dcreate_LUstruct_handle      FC_GLOBAL(f_dcreate_lustruct_handle,F_DCREATE_LUSTRUCT_HANDLE)
+#define f_dcreate_SOLVEstruct_handle   FC_GLOBAL(f_dcreate_solvestruct_handle,F_DCREATE_SOLVESTRUCT_HANDLE)
+#define f_dScalePermstructInit         FC_GLOBAL(f_dscalepermstructinit,F_DSCALEPERMSTRUCTINIT)
+#define f_dScalePermstructFree         FC_GLOBAL(f_dscalepermstructfree,F_DSCALEPERMSTRUCTFREE)
+#define f_dLUstructInit                FC_GLOBAL(f_dlustructinit,F_DLUSTRUCTINIT)
+#define f_dLUstructFree                FC_GLOBAL(f_dlustructfree,F_DLUSTRUCTFREE)
+#define f_dDestroy_LU_SOLVE_struct     FC_GLOBAL(f_ddestroy_lu_solve_struct,F_DDESTROY_LU_SOLVE_STRUCT)
+#define f_dDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_ddestroy_lu_solve_struct_3d,F_DDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_dDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_ddestroy_a3d_gathered_on_2d,F_DDESTROY_A3D_GATHERED_ON_2D)
+
+#define f_dCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST)
+#define f_dSolveFinalize               FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE)
+#define f_pdgssvx                      FC_GLOBAL(f_pdgssvx,F_PDGSSVX)
+#define f_pdgssvx3d                    FC_GLOBAL(f_pdgssvx3d,F_PDGSSVX3D)
+#define f_dcreate_dist_matrix          FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX)
+#define f_dcreate_matrix_x_b           FC_GLOBAL(f_dcreate_matrix_x_b,F_DCREATE_MATRIX_X_B)
+#define f_dcreate_matrix_x_b_3d        FC_GLOBAL(f_dcreate_matrix_x_b_3d,F_DCREATE_MATRIX_X_B_3D)
+
+////// complex16
+#define f_zcreate_ScalePerm_handle     FC_GLOBAL(f_zcreate_scaleperm_handle,F_ZCREATE_SCALEPERM_HANDLE)
+#define f_zcreate_LUstruct_handle      FC_GLOBAL(f_zcreate_lustruct_handle,F_ZCREATE_LUSTRUCT_HANDLE)
+#define f_zcreate_SOLVEstruct_handle   FC_GLOBAL(f_zcreate_solvestruct_handle,F_ZCREATE_SOLVESTRUCT_HANDLE)
+#define f_zScalePermstructInit         FC_GLOBAL(f_zscalepermstructinit,F_ZSCALEPERMSTRUCTINIT)
+#define f_zScalePermstructFree         FC_GLOBAL(f_zscalepermstructfree,F_ZSCALEPERMSTRUCTFREE)
+#define f_zLUstructInit                FC_GLOBAL(f_zlustructinit,F_ZLUSTRUCTINIT)
+#define f_zLUstructFree                FC_GLOBAL(f_zlustructfree,F_ZLUSTRUCTFREE)
+#define f_zDestroy_LU_SOLVE_struct     FC_GLOBAL(f_zdestroy_lu_solve_struct,F_ZDESTROY_LU_SOLVE_STRUCT)
+#define f_zDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_zdestroy_lu_solve_struct_3d,F_ZDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_zDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_zdestroy_a3d_gathered_on_2d,F_ZDESTROY_A3D_GATHERED_ON_2D)
+
+#define f_zCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST)
+#define f_zSolveFinalize               FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE)
+#define f_pzgssvx                      FC_GLOBAL(f_pzgssvx,F_PZGSSVX)
+#define f_pzgssvx3d                    FC_GLOBAL(f_pzgssvx3d,F_PZGSSVX3D)
+#define f_zcreate_matrix_x_b           FC_GLOBAL(f_zcreate_matrix_x_b,F_ZCREATE_MATRIX_X_B)
+#define f_zcreate_matrix_x_b_3d        FC_GLOBAL(f_zcreate_matrix_x_b_3d,F_ZCREATE_MATRIX_X_B_3D)
 
 /* BLAS */
 #define sasum_    FC_GLOBAL(sasum,SASUM)
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index e258745b..9f94c037 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -334,9 +334,12 @@ typedef struct {
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
     int_t *xrow_to_proc; /* used by PDSLin */
+    NRformat_loc3d* A3d; /* Point to 3D {A, B} gathered on 2D layer 0.
+                            This needs to be persistent between
+			    3D factorization and solve.  */
 } dSOLVEstruct_t;
 
-#if 0 
+
 
 /*==== For 3D code ====*/
 
@@ -388,7 +391,7 @@ typedef struct
     int_t bigu_size;
     int_t offloadCondition;
     int_t superlu_acc_offload;
-    int_t nCudaStreams;
+    int_t nGPUStreams;
 } HyP_t;
 
 typedef struct 
@@ -423,13 +426,13 @@ typedef struct
 {
     double *bigU;
     double *bigV;
-} scuBufs_t;
+} dscuBufs_t;
 
 typedef struct
 {   
     double* BlockLFactor;
     double* BlockUFactor;
-} diagFactBufs_t;
+} ddiagFactBufs_t;
 
 typedef struct
 {
@@ -439,7 +442,7 @@ typedef struct
     lPanelInfo_t* lPanelInfo;
 } packLUInfo_t;
 
-#endif
+//#endif
 /*=====================*/
 
 /***********************************************************************
@@ -488,7 +491,7 @@ extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
 extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
-extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
+extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
 				  double **, int *, FILE *, char *, gridinfo_t *);
 
 extern void   dScalePermstructInit(const int_t, const int_t, 
@@ -534,6 +537,7 @@ extern void  pdCompute_Diag_Inv(int_t, dLUstruct_t *,gridinfo_t *, SuperLUStat_t
 extern int  dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
 		       int_t, dLUstruct_t *, gridinfo_t *, dSOLVEstruct_t *);
 extern void dSolveFinalize(superlu_dist_options_t *, dSOLVEstruct_t *);
+extern void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *, gridinfo3d_t *);
 extern int_t pdgstrs_init(int_t, int_t, int_t, int_t,
                           int_t [], int_t [], gridinfo_t *grid,
 	                  Glu_persist_t *, dSOLVEstruct_t *);
@@ -558,7 +562,7 @@ extern void dscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup,
                         int_t* lsub, int_t* usub, double* tempv,
                         int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr,
                         gridinfo_t * grid);
-extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
+extern int_t pdgstrf(superlu_dist_options_t *, int, int, double anorm,
 		    dLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
 
 /* #define GPU_PROF
@@ -649,12 +653,13 @@ extern void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
 extern void dZero_CompRowLoc_Matrix_dist(SuperMatrix *);
 extern void dScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, double);
 extern void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, double);
-extern void dZeroLblocks(int, int_t, gridinfo_t *, dLUstruct_t *);
+extern void dZeroLblocks(int, int, gridinfo_t *, dLUstruct_t *);
+extern void dZeroUblocks(int iam, int n, gridinfo_t *, dLUstruct_t *);
 extern void    dfill_dist (double *, int_t, double);
 extern void    dinf_norm_error_dist (int_t, int_t, double*, int_t,
                                      double*, int_t, gridinfo_t*);
 extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
-				double [], int_t , gridinfo_t *);
+				double [], int_t , MPI_Comm);
 extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *,
 			   double **, int_t **, int_t **);
 extern void  dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
@@ -702,14 +707,12 @@ extern void dgemm_(const char*, const char*, const int*, const int*, const int*,
                   const int*, const double*, double*, const int*, int, int);
 extern void dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*, int, int, int);
-extern void dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*,
-                  int*, int, int, int, int);
-extern void dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *, int);
-
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
+extern void dtrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const double*, const double*, const int*,
+		  double*, const int*, int, int, int, int);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                  const double *a, const int *, const double *, const int *,
+		  const double *, double *, const int *, int);
 
 #else
 extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
@@ -717,24 +720,62 @@ extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
                    const int*,  const double*, double*, const int*);
 extern int dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*);
-extern int dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*, int*);
-extern int dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *);
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
-
+extern int dtrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const double*, const double*, const int*,
+		  double*, const int*);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                  const double *a, const int *, const double *, const int *,
+		  const double *, double *, const int *);
 #endif
 
-extern int dscal_(int *n, double *da, double *dx, int *incx);
-extern int daxpy_(int *n, double *za, double *zx, 
-	               int *incx, double *zy, int *incy);
+extern void dger_(const int*, const int*, const double*,
+                 const double*, const int*, const double*, const int*,
+		 double*, const int*);
+
+extern int dscal_(const int *n, const double *alpha, double *dx, const int *incx);
+extern int daxpy_(const int *n, const double *alpha, const double *x, 
+	               const int *incx, double *y, const int *incy);
+
+/* SuperLU BLAS interface: dsuperlu_blas.c  */
+extern int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a,
+                  int lda, double *b, int ldb, double beta, double *c, int ldc);
+extern int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag, const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb);
+extern int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda);
+extern int superlu_dscal(const int n, const double alpha, double *x, const int incx);
+extern int superlu_daxpy(const int n, const double alpha,
+    const double *x, const int incx, double *y, const int incy);
+extern int superlu_dgemv(const char *trans, const int m,
+                  const int n, const double alpha, const double *a,
+                  const int lda, const double *x, const int incx,
+                  const double beta, double *y, const int incy);
+extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                  int n, double *a, int lda, double *x, int incx);
+
+#ifdef SLU_HAVE_LAPACK
 // LAPACK routine
 extern void dtrtri_(char*, char*, int*, double*, int*, int*);
+#endif
 
-
-#if 0
 /*==== For 3D code ====*/
+extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
+                     int *ldb, double **x, int *ldx,
+                     FILE *fp, gridinfo3d_t *grid3d);
+extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
+                           int *ldb, double **x, int *ldx,
+                           FILE *fp, char * postfix, gridinfo3d_t *grid3d);
+    
+/* Matrix distributed in NRformat_loc in 3D process grid. It converts 
+   it to a NRformat_loc distributed in 2D grid in grid-0 */
+extern void dGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, double *B,
+				   int ldb, int nrhs, gridinfo3d_t *grid3d,
+				   NRformat_loc3d **);
+extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 
 extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 		       dScalePermstruct_t *, double B[], int ldb, int nrhs,
@@ -750,13 +791,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
     /* from scatter.h */
 extern void
 dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
-                    Remain_info_t *Remain_info, double *L_mat, int_t ldl,
-                    double *U_mat, int_t ldu,  double *bigV,
+                    Remain_info_t *Remain_info, double *L_mat, int ldl,
+                    double *U_mat, int ldu,  double *bigV,
                     // int_t jj0,
                     int_t knsupc,  int_t klst,
                     int_t *lsub, int_t *usub, int_t ldt,
                     int_t thread_id,
-                    int_t *indirect, int_t *indirect2,
+                    int *indirect, int *indirect2,
                     int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                     int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
                     int_t *xsup, gridinfo_t *, SuperLUStat_t *
@@ -764,6 +805,8 @@ dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
                     , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                   );
+
+#ifdef _OPENMP
 /*this version uses a lock to prevent multiple thread updating the same block*/
 extern void
 dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
@@ -774,7 +817,7 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
                          int_t knsupc,  int_t klst,
                          int_t *lsub, int_t *usub, int_t ldt,
                          int_t thread_id,
-                         int_t *indirect, int_t *indirect2,
+                         int *indirect, int *indirect2,
                          int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                          int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
                          int_t *xsup, gridinfo_t *
@@ -782,11 +825,13 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
                          , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                        );
+#endif
+
 extern int_t
 dblock_gemm_scatterTopLeft( int_t lb,  int_t j, double* bigV,
 				 int_t knsupc,  int_t klst, int_t* lsub,
                                  int_t * usub, int_t ldt,
-				 int_t* indirect, int_t* indirect2,
+				 int* indirect, int* indirect2,
                                  HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                  SCT_t*SCT, SuperLUStat_t *
                                );
@@ -794,21 +839,21 @@ extern int_t
 dblock_gemm_scatterTopRight( int_t lb,  int_t j, double* bigV,
 				  int_t knsupc,  int_t klst, int_t* lsub,
                                   int_t * usub, int_t ldt,
-				  int_t* indirect, int_t* indirect2,
+				  int* indirect, int* indirect2,
                                   HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                   SCT_t*SCT, SuperLUStat_t * );
 extern int_t
 dblock_gemm_scatterBottomLeft( int_t lb,  int_t j, double* bigV,
 				    int_t knsupc,  int_t klst, int_t* lsub,
                                     int_t * usub, int_t ldt, 
-				    int_t* indirect, int_t* indirect2,
+				    int* indirect, int* indirect2,
                                     HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                     SCT_t*SCT, SuperLUStat_t * );
 extern int_t 
 dblock_gemm_scatterBottomRight( int_t lb,  int_t j, double* bigV,
 				     int_t knsupc,  int_t klst, int_t* lsub,
                                      int_t * usub, int_t ldt,
-				     int_t* indirect, int_t* indirect2,
+				     int* indirect, int* indirect2,
                                      HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                      SCT_t*SCT, SuperLUStat_t * );
 
@@ -841,10 +886,10 @@ extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d
 extern void d3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
 			    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 
-extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
-		       superlu_dist_options_t *options, dLocalLU_t *Llu,
-		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
-		       int_t * iperm_c_supno);
+//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
+//		       superlu_dist_options_t *options, dLocalLU_t *Llu,
+//		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
+//		       int_t * iperm_c_supno);
 
 extern void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
 				  sForest_t**  sForests, dLUstruct_t* LUstruct,
@@ -855,28 +900,6 @@ extern int_t dgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests,
 				   SCT_t* SCT );
 
     /* The following are from pdgstrf2.h */
-#if 0 // Sherry: same routine names, but different code !!!!!!!
-extern void pdgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t,
-                          int_t k, double thresh, Glu_persist_t *,
-			  gridinfo_t *, dLocalLU_t *, MPI_Request *U_diag_blk_send_req,
-			  int tag_ub, SuperLUStat_t *, int *info, SCT_t *);
-#ifdef _CRAY
-void pdgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *,
-                      dLocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
-#else
-void pdgstrs2_omp (int_t, int_t, int_t, int_t *, double*, Glu_persist_t *, gridinfo_t *,
-                      dLocalLU_t *, SuperLUStat_t *, Ublock_info_t *, double *bigV, int_t ldt, SCT_t *SCT );
-#endif
-
-#else 
-extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k,
-			  double thresh, Glu_persist_t *, gridinfo_t *,
-			  dLocalLU_t *, MPI_Request *, int tag_ub,
-			  SuperLUStat_t *, int *info);
-extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *,
-			 dLocalLU_t *, Ublock_info_t *, SuperLUStat_t *);
-#endif // same routine names   !!!!!!!!
-
 extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr,
 			  int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*);
 extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k,
@@ -891,7 +914,7 @@ extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 			   double* uval, double *tempv);
 extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
                              int_t *usub, double* uval, double *tempv,
-                             int_t knsupc, int_t nsupr, double* lusup,
+                             int_t knsupc, int nsupr, double* lusup,
                              Glu_persist_t *Glu_persist)  ;
 extern void pdgstrs2
 #ifdef _CRAY
@@ -922,9 +945,9 @@ extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruc
 extern int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d);
 /*zero out LU non zero entries*/
 extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , dLUstruct_t *, gridinfo3d_t*);
-extern int AllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
-extern int DeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
-extern int DeAllocGlu_3d(dLUstruct_t *);
+extern int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
+extern int dDeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
+extern int dDeAllocGlu_3d(dLUstruct_t *);
 
 /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes)
 receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)]
@@ -935,7 +958,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver,
                         double* Lval_buf, double* Uval_buf,
                         dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
 /*reduces all nodelists required in a level*/
-int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
+extern int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
                            int_t** treePerm,
                            dLUValSubBuf_t* LUvsb,
                            dLUstruct_t* LUstruct,
@@ -975,21 +998,21 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha,
     /* from communication_aux.h */
 extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup,
 			     gridinfo_t *, int* msgcnt, MPI_Request *,
-			     int_t **ToSendR, int_t *xsup, int );
+			     int **ToSendR, int_t *xsup, int );
 extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup,
-			   gridinfo_t *, int* msgcnt, int_t **ToSendR,
+			   gridinfo_t *, int* msgcnt, int **ToSendR,
 			   int_t *xsup , SCT_t*, int);
 extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
 			    gridinfo_t *, int* msgcnt, MPI_Request *,
-			    int_t *ToSendD, int );
+			    int *ToSendD, int );
 extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
-			   gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int);
+			   gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int);
 extern int_t dIrecv_LPanel (int_t k, int_t k0,  int_t* Lsub_buf, 
 			    double* Lval_buf, gridinfo_t *,
 			    MPI_Request *, dLocalLU_t *, int);
 extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*,
 			   dLocalLU_t *, gridinfo_t*, MPI_Request *, int);
-extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR,
+extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
 			MPI_Request *s, SCT_t*);
 extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
 extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *);
@@ -1062,7 +1085,7 @@ extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor,
 			    gridinfo_t *, dLUstruct_t *);
 
     /* from trfAux.h */
-extern int_t getNsupers(int, dLUstruct_t *);
+extern int getNsupers(int, Glu_persist_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
 extern int   freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
@@ -1073,33 +1096,29 @@ extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 				   double* Uval_buf, gridinfo_t *, dLUstruct_t *);
 extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      int_t*, int_t*, int_t*, gEtreeInfo_t*,
-				      factNodelists_t*, scuBufs_t*,
+				      factNodelists_t*, dscuBufs_t*,
 				      dLUValSubBuf_t* LUvsb, gridinfo_t *,
 				      dLUstruct_t *, HyP_t*);
 extern double* dgetBigV(int_t, int_t);
 extern double* dgetBigU(int_t, gridinfo_t *, dLUstruct_t *);
-extern int_t getBigUSize(int_t, gridinfo_t *, dLUstruct_t *);
 // permutation from superLU default
-extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
-			      dLUstruct_t *, gridinfo_t *);
-extern void getSCUweight(int_t nsupers, treeList_t* treeList, dLUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
 extern int_t dLluBufInit(dLUValSubBuf_t*, dLUstruct_t *);
 extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
-			  scuBufs_t*, dLUstruct_t*, gridinfo_t *);
-extern int dfreeScuBufs(scuBufs_t* scuBufs);
+			  dscuBufs_t*, dLUstruct_t*, gridinfo_t *);
+extern int dfreeScuBufs(dscuBufs_t* scuBufs);
 
 // the generic tree factoring code 
 extern int_t treeFactor(
     int_t nnnodes,          // number of nodes in the tree
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1115,11 +1134,11 @@ extern int_t dsparseTreeFactor(
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     treeTopoInfo_t* treeTopoInfo,
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1134,11 +1153,11 @@ extern int_t ddenseTreeFactor(
     int_t nnnodes,          // number of nodes in the tree
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1152,11 +1171,11 @@ extern int_t ddenseTreeFactor(
 extern int_t dsparseTreeFactor_ASYNC(
     sForest_t* sforest,
     commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,     // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
     dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     gEtreeInfo_t*   gEtreeInfo,        // global etree info
@@ -1170,9 +1189,9 @@ extern int_t dsparseTreeFactor_ASYNC(
 );
 extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct);
 extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs);
-extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
-extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs);
-extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
+extern ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
+extern int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs);
+extern int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
 extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
@@ -1182,11 +1201,11 @@ extern int_t ancestorFactor(
     int_t ilvl,             // level of factorization 
     sForest_t* sforest,
     commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,     // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
     dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     gEtreeInfo_t*   gEtreeInfo,        // global etree info
@@ -1198,8 +1217,8 @@ extern int_t ancestorFactor(
     double thresh,  SCT_t *SCT, int tag_ub, int *info
 );
 
-/*=====================*/
-#endif  // end 3D prototypes
+/*== end 3D prototypes ===================*/
+
 
 #ifdef __cplusplus
   }
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index c887df04..d71b930e 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -23,6 +23,9 @@ at the top-level directory.
  *     February 8, 2019    version 6.1.1
  *     November 12, 2019   version 6.2.0
  *     October 23, 2020    version 6.4.0
+ *     May 12, 2021        version 7.0.0
+ *     October 5, 2021     version 7.1.0
+ *     October 18, 2021    version 7.1.1
  * 
*/ @@ -47,9 +50,10 @@ at the top-level directory. #include #include #include -//#include +// #include #include #include +//#include Sherry: not available on Mac OS // /* Following is for vtune */ // #if 0 // #include @@ -72,10 +76,10 @@ at the top-level directory. * #endif * Versions 4.x and earlier do not include a #define'd version numbers. */ -#define SUPERLU_DIST_MAJOR_VERSION 6 -#define SUPERLU_DIST_MINOR_VERSION 4 -#define SUPERLU_DIST_PATCH_VERSION 0 -#define SUPERLU_DIST_RELEASE_DATE "October 23, 2020" +#define SUPERLU_DIST_MAJOR_VERSION 7 +#define SUPERLU_DIST_MINOR_VERSION 1 +#define SUPERLU_DIST_PATCH_VERSION 1 +#define SUPERLU_DIST_RELEASE_DATE "October 18, 2021" #include "superlu_dist_config.h" @@ -103,7 +107,7 @@ at the top-level directory. #elif defined (_LONGINT) typedef int64_t int_t; #define mpi_int_t MPI_LONG_LONG_INT - #define IFMT "%ld" + #define IFMT "%lld" #else /* Default */ typedef int int_t; #define mpi_int_t MPI_INT @@ -111,7 +115,6 @@ at the top-level directory. 
#endif - /* MPI C complex datatype */ #define SuperLU_MPI_COMPLEX MPI_C_COMPLEX #define SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX @@ -122,13 +125,20 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; */ #include "superlu_FortranCInterface.h" -//#include "Cnames.h" #include "superlu_FCnames.h" #include "superlu_enum_consts.h" #include "supermatrix.h" #include "util_dist.h" #include "psymbfact.h" +#ifdef GPU_ACC +#include +#endif + + +#define MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ + + #define ISORT /* NOTE: qsort() has bug on Mac */ /*********************************************************************** @@ -193,7 +203,7 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; * 0,1: for sending L to "right" * * 2,3: for sending off-diagonal blocks of U "down" * * 4 : for sending the diagonal blcok down (in pxgstrf2) */ -#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) +//#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) /* For numeric factorization. 
*/ #if 0 @@ -340,16 +350,40 @@ typedef struct { int Iam; /* my process number */ } superlu_scope_t; -/*-- Process grid definition */ +/*-- 2D process grid definition */ typedef struct { MPI_Comm comm; /* MPI communicator */ superlu_scope_t rscp; /* process scope in rowwise, horizontal directon */ superlu_scope_t cscp; /* process scope in columnwise, vertical direction */ - int iam; /* my process number in this scope */ + int iam; /* my process number in this grid */ int_t nprow; /* number of process rows */ int_t npcol; /* number of process columns */ } gridinfo_t; +/*-- 3D process grid definition */ +typedef struct { + MPI_Comm comm; /* MPI communicator */ + superlu_scope_t rscp; /* row scope */ + superlu_scope_t cscp; /* column scope */ + superlu_scope_t zscp; /* scope in third dimension */ + gridinfo_t grid2d; /* for using 2D functions */ + int iam; /* my process number in this grid */ + int_t nprow; /* number of process rows */ + int_t npcol; /* number of process columns */ + int_t npdep; /* number of replication factor in Z-dimension */ + int rankorder; /* = 0: Z-major ( default ) + * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 3 6 9 + * 1 4 7 10 + * 2 5 8 11 + * = 1: XY-major (need set env. var.: RANKORDER=XY) + * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 1 2 4 + * 5 6 7 8 + * 9 10 11 12 + */ +} gridinfo3d_t; + /* *-- The structures are determined by SYMBFACT and used thereafter. @@ -670,12 +704,13 @@ typedef struct { yes_no_t lookahead_etree; /* use etree computed from the serial symbolic factorization */ yes_no_t SymPattern; /* symmetric factorization */ + yes_no_t Algo3d; /* use 3D factorization/solve algorithms */ } superlu_dist_options_t; typedef struct { float for_lu; float total; - int_t expansions; + int expansions; int64_t nnzL, nnzU; } superlu_dist_mem_usage_t; @@ -693,12 +728,18 @@ typedef struct { int_t iukp; int_t jb; int_t full_u_cols; + int_t eo; /* order of elimination. 
For 3D algorithm */ + int_t ncols; + int_t StCol; } Ublock_info_t; typedef struct { int_t lptr; int_t ib; + int_t eo; /* order of elimination, for 3D code */ + int_t nrows; int_t FullRow; + int_t StRow; } Remain_info_t; typedef struct @@ -715,6 +756,195 @@ struct superlu_pair /**--------**/ +/*==== For 3D code ====*/ + +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) */ +// int tag_ub; +// #define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) + +// #undef SLU_MPI_TAG +/*defining my own MPI tags */ +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) * + * 5 : for sending the diagonal L block right () : added by piyush */ +#define SLU_MPI_TAG(id,num) ( (6*(num)+id) % tag_ub ) + +/*structs for quick look up */ +typedef struct +{ + int_t luptrj; + int_t lptrj; + int_t lib; +} local_l_blk_info_t; + +typedef struct +{ + int_t iuip; + int_t ruip; + int_t ljb; +} local_u_blk_info_t; + + +//global variable +extern double CPU_CLOCK_RATE; + +typedef struct +{ + int_t *perm_c_supno; + int_t *iperm_c_supno; +} perm_array_t; + +typedef struct +{ + int_t* factored; + int_t* factored_D; + int_t* factored_L; + int_t* factored_U; + int_t* IrecvPlcd_D; + int_t* IbcastPanel_L; /*I bcast and recv placed for the k-th L panel*/ + int_t* IbcastPanel_U; /*I bcast and recv placed for the k-th U panel*/ + int_t* numChildLeft; /*number of children left to be factored*/ + int_t* gpuLUreduced; /*New for GPU acceleration*/ +}factStat_t; + +typedef struct +{ + int_t next_col; + int_t next_k; + int_t kljb; + int_t kijb; + 
int_t copyL_kljb; + int_t copyU_kljb; + int_t l_copy_len; + int_t u_copy_len; + int_t *kindexL; + int_t *kindexU; + int_t mkrow; + int_t mkcol; + int_t ksup_size; +} d2Hreduce_t; + +typedef struct{ + int_t numChild; + int_t numDescendents; + int_t left; + int_t right; + int_t extra; + int_t* childrenList; + int_t depth; // distance from the top + double weight; // weight of the supernode + double iWeight; // weight of the whole subtree below + double scuWeight; // weight of schur complement update = max|n_k||L_k||U_k| +} treeList_t; + +typedef struct +{ + int_t numLvl; // number of level in tree; + int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; + +} treeTopoInfo_t; + +typedef struct +{ + int_t* setree; // global supernodal elimination tree + int_t* numChildLeft; +} gEtreeInfo_t; + +typedef enum treePartStrat{ + ND, // nested dissection ordering or natural ordering + GD // greedy load balance stregy +}treePartStrat; + +typedef struct +{ + /* data */ + int_t nNodes; // total number of nodes + int_t* nodeList; // list of nodes, should be in order of factorization +#if 0 // Sherry: the following array is used on rForest_t. ??? + int_t* treeHeads; +#endif + /*topological information about the tree*/ + int_t numLvl; // number of Topological levels in the forest + int_t numTrees; // number of tree in the forest + treeTopoInfo_t topoInfo; // +#if 0 // Sherry fix: the following two structures are in treeTopoInfo_t. ??? 
+ int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; +#endif + + /*information about load balance*/ + double weight; // estimated cost + double cost; // measured cost + +} sForest_t; + +typedef struct +{ + /* data */ + MPI_Request* L_diag_blk_recv_req; + MPI_Request* L_diag_blk_send_req; + MPI_Request* U_diag_blk_recv_req; + MPI_Request* U_diag_blk_send_req; + MPI_Request* recv_req; + MPI_Request* recv_requ; + MPI_Request* send_req; + MPI_Request* send_requ; +} commRequests_t; + +typedef struct +{ + int_t *iperm_c_supno; + int_t *iperm_u; + int_t *perm_u; + int *indirect; + int *indirect2; + +} factNodelists_t; + +typedef struct +{ + int* msgcnt; + int* msgcntU; +} msgs_t; + +typedef struct xtrsTimer_t +{ + double trsDataSendXY; + double trsDataSendZ; + double trsDataRecvXY; + double trsDataRecvZ; + double t_pdReDistribute_X_to_B; + double t_pdReDistribute_B_to_X; + double t_forwardSolve; + double tfs_compute; + double tfs_comm; + double t_backwardSolve; + double tbs_compute; + double tbs_comm; + double tbs_tree[2*MAX_3D_LEVEL]; + double tfs_tree[2*MAX_3D_LEVEL]; + + // counters for communication and computation volume + + int_t trsMsgSentXY; + int_t trsMsgSentZ; + int_t trsMsgRecvXY; + int_t trsMsgRecvZ; + + double ppXmem; // perprocess X-memory +} xtrsTimer_t; + +/*==== For 3D code ====*/ + +/*====================*/ /*********************************************************************** * Function prototypes @@ -724,11 +954,14 @@ struct superlu_pair extern "C" { #endif -extern void set_default_options_dist(superlu_dist_options_t *); -extern void superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *); -extern void superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t, - gridinfo_t *); +extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *); +extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *); extern void superlu_gridexit(gridinfo_t *); +extern void 
superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, + gridinfo3d_t *grid) ; +extern void superlu_gridexit3d(gridinfo3d_t *grid); + +extern void set_default_options_dist(superlu_dist_options_t *); extern void print_options_dist(superlu_dist_options_t *); extern void print_sp_ienv_dist(superlu_dist_options_t *); extern void Destroy_CompCol_Matrix_dist(SuperMatrix *); @@ -777,7 +1010,7 @@ extern int_t estimate_bigu_size (int_t, int_t **, Glu_persist_t *, /* Auxiliary routines */ extern double SuperLU_timer_ (); extern void superlu_abort_and_exit_dist(char *); -extern int_t sp_ienv_dist (int_t); +extern int sp_ienv_dist (int); extern void ifill_dist (int_t *, int_t, int_t); extern void super_stats_dist (int_t, int_t *); extern void get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *, @@ -824,6 +1057,18 @@ extern int_t psymbfact_LUXpand_RL extern int_t psymbfact_prLUXpand (int_t, int_t, int, Llu_symbfact_t *, psymbfact_stat_t *); +#ifdef ISORT +extern void isort (int_t N, int_t *ARRAY1, int_t *ARRAY2); +extern void isort1 (int_t N, int_t *ARRAY); +#else +int superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + #ifdef GPU_ACC /* GPU related */ extern void gemm_division_cpu_gpu (int *, int *, int *, int, int, int, int *, int); @@ -831,12 +1076,16 @@ extern int_t get_gpublas_nb (); extern int_t get_num_gpu_streams (); #endif +extern double estimate_cpu_time(int m, int n , int k); + extern int get_thread_per_process(); extern int_t get_max_buffer_size (); extern int_t get_min (int_t *, int_t); extern int compare_pair (const void *, const void *); extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t, int_t *, int_t *, int); +extern int get_acc_offload(); + /* Routines for debugging */ extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); @@ -849,7 +1098,6 @@ extern int 
file_PrintInt10(FILE *, char *, int_t, int_t *); extern int file_PrintInt32(FILE *, char *, int, int *); extern int file_PrintLong10(FILE *, char *, int_t, int_t *); - /* Routines for Async_tree communication*/ #ifndef __SUPERLU_ASYNC_TREE /* allow multiple inclusions */ @@ -886,6 +1134,167 @@ extern yes_no_t C_BcTree_IsRoot(C_Tree* tree); extern void C_BcTree_forwardMessageSimple(C_Tree* tree, void* localBuffer, int msgSize); extern void C_BcTree_waitSendRequest(C_Tree* tree); +/*==== For 3D code ====*/ + +extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); +extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); +extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT); + +// permutation from superLU default +extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, + int_t *etree, Glu_persist_t *Glu_persist, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo_t *); + +/* Manipulate counters */ +extern void SCT_init(SCT_t*); +extern void SCT_print(gridinfo_t *grid, SCT_t* SCT); +extern void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_free(SCT_t*); + +extern treeList_t* setree2list(int_t nsuper, int_t* setree ); +extern int free_treelist(int_t nsuper, treeList_t* treeList); + +// int_t calcTreeWeight(int_t nsupers, treeList_t* treeList, int_t* xsup); +extern int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup); +extern int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList); +extern int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList); +extern int_t getCommonAncsCount(int_t k, treeList_t* treeList); +extern int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup,int_t* iperm_c_sup); +extern int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder); +extern int_t* 
getSubTreeRoots(int_t k, treeList_t* treeList); +// int_t* treeList2perm(treeList_t* , ..); +extern int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms); +// returns a concatenated permutation for three permutation arrays + +extern int_t* getGlobal_iperm(int_t nsupers, int_t nperms, int_t** perms, + int_t* nnodes); +extern int_t log2i(int_t index); +extern int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup); +extern int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount); +extern int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel); + +/*takes supernodal elimination tree and for each + supernode calculates "level" in elimination tree*/ +extern int_t* topological_ordering(int_t nsuper, int_t* setree); +extern int_t* Etree_LevelBoundry(int_t* perm,int_t* tsort_etree, int_t nsuper); + +/*calculated boundries of the topological levels*/ +extern int_t* calculate_num_children(int_t nsuper, int_t* setree); +extern void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper); +extern void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper); +extern void print_etree(int_t *setree, int_t* iperm, int_t nsuper); +extern int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree); +int* getLastDepBtree( int_t nsupers, treeList_t* treeList); + +/*returns array R with of size maxLevel with either 0 or 1 + R[i] = 1; then Tree[level-i] is set to zero= to only + accumulate the results */ +extern int_t* getReplicatedTrees( gridinfo3d_t* grid3d); + +/*returns indices in gNodeList of trees that belongs to my layer*/ +extern int_t* getGridTrees( gridinfo3d_t* grid3d); + + +/*returns global nodelist*/ +extern int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList); + +/* calculate number of nodes in subtrees starting from treeHead[i]*/ +extern int_t* calcNumNodes(int_t maxLvl, 
int_t* treeHeads, treeList_t* treeList); + +/*Returns list of (last) node of the trees */ +extern int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList); + +extern int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm); + +extern int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ); + +extern int_t* getMyEtLims(int_t nnodes, int_t* myTopOrder); + + +extern treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers, + int_t* myPerm,int_t* setree); + +extern sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + +extern int_t** getTreePermForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t* sForests, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d); +extern int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs, + sForest_t** sForests); +extern int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests); +extern int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests); +// int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d); +extern int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm); +extern void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d); +extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList); +extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + + /* from trfAux.h */ +extern int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr); +extern void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo3d_t * grid3d); +extern int getNsupers(int n, Glu_persist_t *Glu_persist); +extern int set_tag_ub(); +extern int getNumThreads(int); +extern int_t 
num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *, int_t *, int_t *); +#if 0 // Sherry: conflicting with existing routine +extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *, gridinfo_t*, int_t* perm_u); +#endif +extern int_t* getFactPerm(int_t); +extern int_t* getFactIperm(int_t*, int_t); + +extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid); +extern int_t initFactStat(int_t nsupers, factStat_t* factStat); +extern int freeFactStat(factStat_t* factStat); +extern int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t*); +extern int freeFactNodelists(factNodelists_t* fNlists); +extern int_t initMsgs(msgs_t* msgs); +extern int_t getNumLookAhead(superlu_dist_options_t*); +extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss); + +extern msgs_t** initMsgsArr(int_t numLA); +extern int freeMsgsArr(int_t numLA, msgs_t **msgss); + +extern int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, + int_t *usub, Glu_persist_t *, SuperLUStat_t*); + + /* from sec_structs.h */ +extern int Cmpfunc_R_info (const void * a, const void * b); +extern int Cmpfunc_U_info (const void * a, const void * b); +extern int sort_R_info( Remain_info_t* Remain_info, int n ); +extern int sort_U_info( Ublock_info_t* Ublock_info, int n ); +extern int sort_R_info_elm( Remain_info_t* Remain_info, int n ); +extern int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ); + + /* from pdgstrs.h */ +extern void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d); +extern void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid); + + /* from p3dcomm.c */ +extern int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t* getMyNodeCounts(int_t maxLvl, 
int_t* myTreeIdxs, int_t* gNodeCount); +extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); +extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d); + + extern int getnGPUStreams(); + extern int get_mpi_process_per_gpu (); + +/*=====================*/ + #ifdef __cplusplus } #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 3f6eaf2b..b99ed621 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -20,5 +20,4 @@ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in index 3164559d..9c3142aa 100644 --- a/SRC/superlu_dist_config.h.in +++ b/SRC/superlu_dist_config.h.in @@ -20,5 +20,4 @@ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h index 4d2a6d03..3103e46e 100644 --- a/SRC/superlu_enum_consts.h +++ b/SRC/superlu_enum_consts.h @@ -15,7 +15,6 @@ at the top-level directory. * Lawrence Berkeley National Lab, Univ. 
of California Berkeley, * October 1, 2010 * January 28, 2018 - * January 28, 2018 * */ @@ -78,7 +77,7 @@ typedef enum { SOL_COMM,/* communication for solve */ SOL_GEMM,/* gemm for solve */ SOL_TRSM,/* trsm for solve */ - SOL_TOT, /* LU-solve time*/ + SOL_TOT, /* LU-solve time*/ RCOND, /* estimate reciprocal condition number */ SOLVE, /* forward and back solves */ REFINE, /* perform iterative refinement */ diff --git a/SRC/superlu_gpu_utils.cu b/SRC/superlu_gpu_utils.cu new file mode 100644 index 00000000..b472e5b9 --- /dev/null +++ b/SRC/superlu_gpu_utils.cu @@ -0,0 +1,94 @@ +#include "superlu_defs.h" + +/*error reporting functions */ +gpuError_t checkGPU(gpuError_t result) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (result != gpuSuccess) { + fprintf(stderr, "GPU Runtime Error: %s\n", gpuGetErrorString(result)); + assert(result == gpuSuccess); + } +#endif + return result; +} + + + +__device__ int dnextpow2(int v) + +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + return v; +} + + +typedef int pfx_dtype ; +__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n) +{ + // extern __shared__ pfx_dtype temp[]; + int n_original = n; + n = (n & (n - 1)) == 0? 
n: dnextpow2(n); + int thread_id = threadIdx.x; + int offset = 1; + if(2*thread_id < n_original) + temp[2*thread_id] = inOutArr[2*thread_id]; + else + temp[2*thread_id] =0; + + + if(2*thread_id+1 >1; d > 0; d >>= 1) + { + __syncthreads(); + if (thread_id < d) + { + int ai = offset*(2*thread_id+1)-1; + int bi = offset*(2*thread_id+2)-1; + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (thread_id == 0) { temp[n - 1] = 0; } + for (int d = 1; d < n; d *= 2) + { + offset >>= 1; + __syncthreads(); + if (thread_id < d) + { + int ai = offset*(2*thread_id+1)-1; + int bi = offset*(2*thread_id+2)-1; + pfx_dtype t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + if(2*thread_id < n_original) + inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory + if(2*thread_id+1 < n_original) + inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1]; + __syncthreads(); + +} /* end incScan */ + + +#if 0 // Not used +__global__ void gExScan(pfx_dtype *inArr, int n) +{ + extern __shared__ pfx_dtype temp[]; + incScan(inArr, temp, n); + +} +#endif diff --git a/SRC/superlu_gpu_utils.hip.cpp b/SRC/superlu_gpu_utils.hip.cpp new file mode 100644 index 00000000..23b669dc --- /dev/null +++ b/SRC/superlu_gpu_utils.hip.cpp @@ -0,0 +1 @@ +#include "superlu_gpu_utils.cu" \ No newline at end of file diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index 4a23cccc..4eff3cb7 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -12,10 +12,11 @@ at the top-level directory. * \brief SuperLU grid utilities * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * September 1, 1999
  * February 8, 2019  version 6.1.1
+ * October 5, 2021
  * 
*/ @@ -27,17 +28,22 @@ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; #endif /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, int_t npcol, gridinfo_t *grid) + int nprow, int npcol, gridinfo_t *grid) { int Np = nprow * npcol; - int_t *usermap; + int *usermap; int i, j, info; /* Make a list of the processes in the new communicator. */ - usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + usermap = SUPERLU_MALLOC(Np*sizeof(int)); for (j = 0; j < npcol; ++j) for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; @@ -55,30 +61,38 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid); SUPERLU_FREE(usermap); - + #ifdef GPU_ACC - /* Binding each MPI to a CUDA device */ - int devs, rank; + /* Binding each MPI to a GPU device */ + char *ttemp; + ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); + + if (ttemp) { + int devs, rank; MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD?? gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices gpuSetDevice(rank % devs); // Set device to be used for GPU executions + } #endif - - } /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridmap( MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, - int_t npcol, - int_t usermap[], /* usermap(i,j) holds the process + int nprow, + int npcol, + int usermap[], /* usermap(i,j) holds the process number to be placed in {i,j} of the process grid. 
*/ - int_t ldumap, /* The leading dimension of the + int ldumap, /* The leading dimension of the 2D array usermap[]. */ gridinfo_t *grid) { @@ -118,17 +132,17 @@ void superlu_gridmap( MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); /* Create the new communicator. */ /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. */ + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); - /* Bail out if I am not in the group, superlu_group. */ + /* Bail out if I am not in the group "superlu_grp". */ if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; + // grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; } MPI_Comm_rank( grid->comm, &(grid->iam) ); @@ -176,14 +190,16 @@ void superlu_gridmap( } #endif + gridmap_out: SUPERLU_FREE(pranks); MPI_Group_free(&superlu_grp); MPI_Group_free(&mpi_base_group); -} + +} /* superlu_gridmap */ void superlu_gridexit(gridinfo_t *grid) { - if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + if ( grid->comm != MPI_COMM_NULL ) { /* Marks the communicator objects for deallocation. */ MPI_Comm_free( &grid->rscp.comm ); MPI_Comm_free( &grid->cscp.comm ); diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c new file mode 100644 index 00000000..25f7ba4b --- /dev/null +++ b/SRC/superlu_grid3d.c @@ -0,0 +1,273 @@ +/*! @file + * \brief SuperLU grid utilities + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * 
+ */ + +#include "superlu_ddefs.h" + +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int nprow, + int npcol, + int npdep, + gridinfo3d_t *grid); + + +/*! \brief All processes in the MPI communicator must call this routine. + */ +void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int nprow, int npcol, int npdep, gridinfo3d_t *grid) +{ + int Np = nprow * npcol * npdep; + int i, j, info; + + /* Make a list of the processes in the new communicator. */ + // usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + // for (j = 0; j < npcol; ++j) + // for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + MPI_Comm_size( Bcomm, &info ); + if ( info < Np ) + ABORT("Number of processes is smaller than NPROW * NPCOL * NPDEP"); + + superlu_gridmap3d(Bcomm, nprow, npcol, npdep, grid); + + // SUPERLU_FREE(usermap); + +#ifdef GPU_ACC + /* Binding each MPI to a GPU device */ + char *ttemp; + ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); + + if (ttemp) { + int devs, rank; + MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD?? + gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices + gpuSetDevice(rank % devs); // Set device to be used for GPU executions + } +#endif +} + + +/*! \brief All processes in the MPI communicator must call this routine. + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 + */ +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. 
*/ + int nprow, + int npcol, + int npdep, + gridinfo3d_t *grid) +{ + MPI_Group mpi_base_group, superlu_grp; + int Np = nprow * npcol * npdep, mycol, myrow; + int *pranks; + int i, j, info; + +#if 0 // older MPI doesn't support complex in C + /* Create datatype in C for MPI complex. */ + if ( SuperLU_MPI_DOUBLE_COMPLEX == MPI_DATATYPE_NULL ) { + MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX ); + MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX ); + } +#endif + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + /* Make a list of the processes in the new communicator. */ + pranks = (int *) SUPERLU_MALLOC(Np * sizeof(int)); + for (j = 0; j < Np; ++j) + pranks[j] = j; + + /* + * Form MPI communicator for all. + */ + /* Get the group underlying Bcomm. */ + MPI_Comm_group( Bcomm, &mpi_base_group ); + /* Create the new group. */ + MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); + /* Create the new communicator. */ + /* NOTE: The call is to be executed by all processes in Bcomm, + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ + MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); + + /* Bail out if I am not in the group, superlu_group. 
*/ + if ( grid->comm == MPI_COMM_NULL ) { + //grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; + } + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + /* Create 3D grid */ + int ndim = 3; + int dims[3]; + int reorder = 1; + int periodic[] = {0, 0, 0}; + int coords3d[3]; + int iam; + MPI_Comm superlu3d_comm; + + if (getenv("RANKORDER") && strcmp(getenv("RANKORDER"), "XY" )) + { + grid->rankorder = 1; // XY-major + + dims[0] = nprow; + dims[1] = npcol; + dims[2] = npdep; + + // create the new communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + // get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + int rowc[3] = {1, 0, 0}; + int colc[3] = {0, 1, 0}; + int depc[3] = {0, 0, 1}; + + // Partition a communicator into subgroups which form + // lower-dimensional cartesian subgrids + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); /* XZ grids */ + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); /* YZ grids */ + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); /* XY grids */ + + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[0]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[1]; + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[2]; + + // + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {1, 1, 0}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } else { /* default */ + grid->rankorder = 0; // Z-major + + dims[1] = nprow; + dims[2] = npcol; + dims[0] = npdep; + + // get the communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + //get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + 
MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + /* printf("(%d) My coordinats are (%d %d %d)\n", + iam, coords3d[0], coords3d[1], coords3d[2] ); + fflush(stdout); */ + + // create row communicator + + int rowc[3] = {0, 1, 0}; + int colc[3] = {0, 0, 1}; + int depc[3] = {1, 0, 0}; + + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); + + // 2x3: 0,2,4 / 1,3,5 column-major + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[1]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[2]; + + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[0]; + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {0, 1, 1}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } /* if RANKORDER */ + + + // Initialize grid2d; + + grid->grid2d.rscp = grid->rscp; + grid->grid2d.cscp = grid->cscp; + grid->grid2d.nprow = nprow; + grid->grid2d.npcol = npcol; + MPI_Comm_rank( grid->grid2d.comm, &(grid->grid2d.iam)); + + // grid->grid2d.cscp = grid->cscp; + +#if 1 + if ( (grid->zscp).Iam == 0) { + printf("(3d grid: layer 0) iam %d, grid->grid2d.iam %d\n", + grid->iam, (grid->grid2d).iam); + } + fflush(stdout); +#endif + + MPI_Comm_free( &superlu3d_comm ); // Sherry added + + gridmap_out: + SUPERLU_FREE(pranks); + MPI_Group_free( &superlu_grp ); + MPI_Group_free( &mpi_base_group ); +} + +void superlu_gridexit3d(gridinfo3d_t *grid) +{ + if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + /* Marks the communicator objects for deallocation. 
*/ + MPI_Comm_free( &grid->rscp.comm ); + MPI_Comm_free( &grid->cscp.comm ); + MPI_Comm_free( &grid->zscp.comm ); + MPI_Comm_free( &grid->grid2d.comm ); + MPI_Comm_free( &grid->comm ); + } +#if 0 + if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) { + MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX ); + SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; /* some MPI system does not set this + to be NULL after Type_free */ + } +#endif +} diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index 146d3acb..9af2d7ba 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -225,9 +225,12 @@ typedef struct { positions in the gathered x-vector. This is re-used in repeated calls to pzgsmv() */ int_t *xrow_to_proc; /* used by PDSLin */ + NRformat_loc3d* A3d; /* Point to 3D {A, B} gathered on 2D layer 0. + This needs to be peresistent between + 3D factorization and solve. */ } zSOLVEstruct_t; -#if 0 + /*==== For 3D code ====*/ @@ -279,7 +282,7 @@ typedef struct int_t bigu_size; int_t offloadCondition; int_t superlu_acc_offload; - int_t nCudaStreams; + int_t nGPUStreams; } HyP_t; typedef struct @@ -314,13 +317,13 @@ typedef struct { doublecomplex *bigU; doublecomplex *bigV; -} scuBufs_t; +} zscuBufs_t; typedef struct { doublecomplex* BlockLFactor; doublecomplex* BlockUFactor; -} diagFactBufs_t; +} zdiagFactBufs_t; typedef struct { @@ -330,7 +333,7 @@ typedef struct lPanelInfo_t* lPanelInfo; } packLUInfo_t; -#endif +//#endif /*=====================*/ /*********************************************************************** @@ -379,7 +382,7 @@ extern int zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); extern int zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); -extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, +extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, 
char *, gridinfo_t *); extern void zScalePermstructInit(const int_t, const int_t, @@ -425,6 +428,7 @@ extern void pzCompute_Diag_Inv(int_t, zLUstruct_t *,gridinfo_t *, SuperLUStat_t extern int zSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [], int_t, zLUstruct_t *, gridinfo_t *, zSOLVEstruct_t *); extern void zSolveFinalize(superlu_dist_options_t *, zSOLVEstruct_t *); +extern void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *, gridinfo3d_t *); extern int_t pzgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, zSOLVEstruct_t *); @@ -449,7 +453,7 @@ extern void zscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup, int_t* lsub, int_t* usub, doublecomplex* tempv, int_t ** Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t * grid); -extern int_t pzgstrf(superlu_dist_options_t *, int, int, double, +extern int_t pzgstrf(superlu_dist_options_t *, int, int, double anorm, zLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); /* #define GPU_PROF @@ -534,12 +538,13 @@ extern void zCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *); extern void zZero_CompRowLoc_Matrix_dist(SuperMatrix *); extern void zScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, doublecomplex); extern void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, doublecomplex); -extern void zZeroLblocks(int, int_t, gridinfo_t *, zLUstruct_t *); +extern void zZeroLblocks(int, int, gridinfo_t *, zLUstruct_t *); +extern void zZeroUblocks(int iam, int n, gridinfo_t *, zLUstruct_t *); extern void zfill_dist (doublecomplex *, int_t, doublecomplex); extern void zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t, doublecomplex*, int_t, gridinfo_t*); extern void pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t, - doublecomplex [], int_t , gridinfo_t *); + doublecomplex [], int_t , MPI_Comm); extern void zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, doublecomplex **, int_t **, int_t **); extern void 
zreadtriple_dist(FILE *, int_t *, int_t *, int_t *, @@ -582,14 +587,12 @@ extern void zgemm_(const char*, const char*, const int*, const int*, const int*, const int*, const doublecomplex*, doublecomplex*, const int*, int, int); extern void ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, doublecomplex*, int*, int, int, int); -extern void ztrsm_(char*, char*, char*, char*, int*, int*, - doublecomplex*, doublecomplex*, int*, doublecomplex*, - int*, int, int, int, int); -extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, - doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int); - -extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, - doublecomplex*, int*, doublecomplex*, int*); +extern void ztrsm_(const char*, const char*, const char*, const char*, + const int*, const int*, const doublecomplex*, const doublecomplex*, const int*, + doublecomplex*, const int*, int, int, int, int); +extern void zgemv_(const char *, const int *, const int *, const doublecomplex *, + const doublecomplex *a, const int *, const doublecomplex *, const int *, + const doublecomplex *, doublecomplex *, const int *, int); #else extern int zgemm_(const char*, const char*, const int*, const int*, const int*, @@ -597,24 +600,62 @@ extern int zgemm_(const char*, const char*, const int*, const int*, const int*, const int*, const doublecomplex*, doublecomplex*, const int*); extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, doublecomplex*, int*); -extern int ztrsm_(char*, char*, char*, char*, int*, int*, - doublecomplex*, doublecomplex*, int*, doublecomplex*, int*); -extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, - doublecomplex *, int *, doublecomplex *, doublecomplex *, int *); -extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, - doublecomplex*, int*, doublecomplex*, int*); - +extern int ztrsm_(const char*, const char*, const char*, const char*, + const int*, 
const int*, const doublecomplex*, const doublecomplex*, const int*, + doublecomplex*, const int*); +extern void zgemv_(const char *, const int *, const int *, const doublecomplex *, + const doublecomplex *a, const int *, const doublecomplex *, const int *, + const doublecomplex *, doublecomplex *, const int *); #endif -extern int zscal_(int *n, doublecomplex *da, doublecomplex *dx, int *incx); -extern int zaxpy_(int *n, doublecomplex *za, doublecomplex *zx, - int *incx, doublecomplex *zy, int *incy); +extern void zgeru_(const int*, const int*, const doublecomplex*, + const doublecomplex*, const int*, const doublecomplex*, const int*, + doublecomplex*, const int*); + +extern int zscal_(const int *n, const doublecomplex *alpha, doublecomplex *dx, const int *incx); +extern int zaxpy_(const int *n, const doublecomplex *alpha, const doublecomplex *x, + const int *incx, doublecomplex *y, const int *incy); + +/* SuperLU BLAS interface: zsuperlu_blas.c */ +extern int superlu_zgemm(const char *transa, const char *transb, + int m, int n, int k, doublecomplex alpha, doublecomplex *a, + int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc); +extern int superlu_ztrsm(const char *sideRL, const char *uplo, + const char *transa, const char *diag, const int m, const int n, + const doublecomplex alpha, const doublecomplex *a, + const int lda, doublecomplex *b, const int ldb); +extern int superlu_zger(const int m, const int n, const doublecomplex alpha, + const doublecomplex *x, const int incx, const doublecomplex *y, + const int incy, doublecomplex *a, const int lda); +extern int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx); +extern int superlu_zaxpy(const int n, const doublecomplex alpha, + const doublecomplex *x, const int incx, doublecomplex *y, const int incy); +extern int superlu_zgemv(const char *trans, const int m, + const int n, const doublecomplex alpha, const doublecomplex *a, + const int lda, const 
doublecomplex *x, const int incx, + const doublecomplex beta, doublecomplex *y, const int incy); +extern int superlu_ztrsv(char *uplo, char *trans, char *diag, + int n, doublecomplex *a, int lda, doublecomplex *x, int incx); + +#ifdef SLU_HAVE_LAPACK // LAPACK routine extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*); +#endif - -#if 0 /*==== For 3D code ====*/ +extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d); +extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d); + +/* Matrix distributed in NRformat_loc in 3D process grid. It converts + it to a NRformat_loc distributed in 2D grid in grid-0 */ +extern void zGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, doublecomplex *B, + int ldb, int nrhs, gridinfo3d_t *grid3d, + NRformat_loc3d **); +extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d); extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *, zScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs, @@ -630,13 +671,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ extern void zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, - Remain_info_t *Remain_info, doublecomplex *L_mat, int_t ldl, - doublecomplex *U_mat, int_t ldu, doublecomplex *bigV, + Remain_info_t *Remain_info, doublecomplex *L_mat, int ldl, + doublecomplex *U_mat, int ldu, doublecomplex *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t *, SuperLUStat_t * @@ -644,6 +685,8 @@ zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t 
*Ublock_info, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); + +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ extern void zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, @@ -654,7 +697,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t * @@ -662,11 +705,13 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); +#endif + extern int_t zblock_gemm_scatterTopLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -674,21 +719,21 @@ extern int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -721,10 +766,10 @@ extern void 
zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d extern void z3D_printMemUse(trf3Dpartition_t* trf3Dpartition, zLUstruct_t *LUstruct, gridinfo3d_t * grid3d); -extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, - superlu_dist_options_t *options, zLocalLU_t *Llu, - int_t* xsup, int_t num_look_aheads, int_t nsupers, - int_t * iperm_c_supno); +//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, +// superlu_dist_options_t *options, zLocalLU_t *Llu, +// int_t* xsup, int_t num_look_aheads, int_t nsupers, +// int_t * iperm_c_supno); extern void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, sForest_t** sForests, zLUstruct_t* LUstruct, @@ -735,28 +780,6 @@ extern int_t zgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, SCT_t* SCT ); /* The following are from pdgstrf2.h */ -#if 0 // Sherry: same routine names, but different code !!!!!!! -extern void pzgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t, - int_t k, double thresh, Glu_persist_t *, - gridinfo_t *, zLocalLU_t *, MPI_Request *U_diag_blk_send_req, - int tag_ub, SuperLUStat_t *, int *info, SCT_t *); -#ifdef _CRAY -void pzgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -void pzgstrs2_omp (int_t, int_t, int_t, int_t *, doublecomplex*, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, SuperLUStat_t *, Ublock_info_t *, doublecomplex *bigV, int_t ldt, SCT_t *SCT ); -#endif - -#else -extern void pzgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, - double thresh, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, MPI_Request *, int tag_ub, - SuperLUStat_t *, int *info); -extern void pzgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, Ublock_info_t *, SuperLUStat_t *); -#endif // same routine names !!!!!!!! 
- extern int_t zLpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, doublecomplex* lusup, int_t nsupr, SCT_t*); extern void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, @@ -771,7 +794,7 @@ extern int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, doublecomplex* uval, doublecomplex *tempv); extern int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, doublecomplex* uval, doublecomplex *tempv, - int_t knsupc, int_t nsupr, doublecomplex* lusup, + int_t knsupc, int nsupr, doublecomplex* lusup, Glu_persist_t *Glu_persist) ; extern void pzgstrs2 #ifdef _CRAY @@ -802,9 +825,9 @@ extern int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruc extern int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ extern int_t zzeroSetLU(int_t nnodes, int_t* nodeList , zLUstruct_t *, gridinfo3d_t*); -extern int AllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *); -extern int DeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*); -extern int DeAllocGlu_3d(zLUstruct_t *); +extern int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *); +extern int zDeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*); +extern int zDeAllocGlu_3d(zLUstruct_t *); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] @@ -815,7 +838,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver, doublecomplex* Lval_buf, doublecomplex* Uval_buf, zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); /*reduces all nodelists required in a level*/ -int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, +extern int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct, @@ -855,21 +878,21 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, /* from communication_aux.h */ extern int_t zIBcast_LPanel (int_t 
k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t **ToSendR, int_t *xsup, int ); + int **ToSendR, int_t *xsup, int ); extern int_t zBcast_LPanel(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, - gridinfo_t *, int* msgcnt, int_t **ToSendR, + gridinfo_t *, int* msgcnt, int **ToSendR, int_t *xsup , SCT_t*, int); extern int_t zIBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t *ToSendD, int ); + int *ToSendD, int ); extern int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, - gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); + gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int); extern int_t zIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, doublecomplex* Lval_buf, gridinfo_t *, MPI_Request *, zLocalLU_t *, int); extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*, zLocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t*); extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t zWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); @@ -942,7 +965,7 @@ extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFac gridinfo_t *, zLUstruct_t *); /* from trfAux.h */ -extern int_t getNsupers(int, zLUstruct_t *); +extern int getNsupers(int, Glu_persist_t *); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); extern int freePackLUInfo(packLUInfo_t* packLUInfo); extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, @@ -953,33 +976,29 @@ extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, doublecomplex* Uval_buf, gridinfo_t *, zLUstruct_t *); extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, int_t*, int_t*, int_t*, gEtreeInfo_t*, - 
factNodelists_t*, scuBufs_t*, + factNodelists_t*, zscuBufs_t*, zLUValSubBuf_t* LUvsb, gridinfo_t *, zLUstruct_t *, HyP_t*); extern doublecomplex* zgetBigV(int_t, int_t); extern doublecomplex* zgetBigU(int_t, gridinfo_t *, zLUstruct_t *); -extern int_t getBigUSize(int_t, gridinfo_t *, zLUstruct_t *); // permutation from superLU default -extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, - zLUstruct_t *, gridinfo_t *); -extern void getSCUweight(int_t nsupers, treeList_t* treeList, zLUstruct_t *, gridinfo3d_t *); /* from treeFactorization.h */ extern int_t zLluBufInit(zLUValSubBuf_t*, zLUstruct_t *); extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t*, zLUstruct_t*, gridinfo_t *); -extern int zfreeScuBufs(scuBufs_t* scuBufs); + zscuBufs_t*, zLUstruct_t*, gridinfo_t *); +extern int zfreeScuBufs(zscuBufs_t* scuBufs); // the generic tree factoring code extern int_t treeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -995,11 +1014,11 @@ extern int_t zsparseTreeFactor( int_t *perm_c_supno, // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo, commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ 
-1014,11 +1033,11 @@ extern int_t zdenseTreeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1032,11 +1051,11 @@ extern int_t zdenseTreeFactor( extern int_t zsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1050,9 +1069,9 @@ extern int_t zsparseTreeFactor_ASYNC( ); extern zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct); extern int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs); -extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); -extern int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs); -extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs); +extern int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf); extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, gridinfo_t *grid, SCT_t *SCT); extern int_t 
checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); @@ -1062,11 +1081,11 @@ extern int_t ancestorFactor( int_t ilvl, // level of factorization sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1078,8 +1097,8 @@ extern int_t ancestorFactor( double thresh, SCT_t *SCT, int tag_ub, int *info ); -/*=====================*/ -#endif // end 3D prototypes +/*== end 3D prototypes ===================*/ + #ifdef __cplusplus } diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h index 1c296530..1d720355 100644 --- a/SRC/supermatrix.h +++ b/SRC/supermatrix.h @@ -188,4 +188,33 @@ typedef struct { } NRformat_loc; +/* Data structure for storing 3D matrix on layer 0 of the 2D process grid + Only grid-0 has meanful values of these data structures. 
*/ +typedef struct NRformat_loc3d +{ + NRformat_loc *A_nfmt; // Gathered A matrix on 2D grid-0 + void *B3d; // on the entire 3D process grid + int ldb; // relative to 3D process grid + int nrhs; + int m_loc; // relative to 3D process grid + void *B2d; // on 2D process layer grid-0 + + int *row_counts_int; // these counts are stored on 2D layer grid-0, + int *row_disp; // but count the number of {A, B} rows along Z-dimension + int *nnz_counts_int; + int *nnz_disp; + int *b_counts_int; + int *b_disp; + + /* The following 4 structures are used for scattering + solution X from 2D grid-0 back to 3D processes */ + int num_procs_to_send; + int *procs_to_send_list; + int *send_count_list; + int num_procs_to_recv; + int *procs_recv_from_list; + int *recv_count_list; +} NRformat_loc3d; + + #endif /* __SUPERLU_SUPERMATRIX */ diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c new file mode 100644 index 00000000..dc6144fa --- /dev/null +++ b/SRC/supernodalForest.c @@ -0,0 +1,971 @@ +/*! @file + * \brief SuperLU utilities + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ +#include +#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include "supernodal_etree.h" +#include "load-balance/supernodalForest.h" +#include "p3dcomm.h" +#endif +#include + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree); + + +sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + // treePartStrat tps; + if (getenv("LBS")) + { + if (strcmp(getenv("LBS"), "ND" ) == 0) + { + return getNestDissForests( maxLvl, nsupers, setree, treeList); + } + if (strcmp(getenv("LBS"), "GD" ) == 0) + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + } + else + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + return 0; +} + +double calcNodeListWeight(int_t nnodes, int_t* nodeList, treeList_t* treeList) +{ + double trWeight = 0; + + for (int i = 0; i < nnodes; ++i) + { + trWeight += treeList[nodeList[i]].weight; + } + + return trWeight; +} + +sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + + int_t numForests = (1 << maxLvl) - 1; + + // allocate space for forests + sForest_t** sForests = SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + + int_t* gTreeHeads = getTreeHeads(maxLvl, nsupers, treeList); + + int_t* gNodeCount = calcNumNodes(maxLvl, gTreeHeads, treeList); + int_t** gNodeLists = getNodeList(maxLvl, setree, gNodeCount, + gTreeHeads, treeList); + + SUPERLU_FREE(gTreeHeads); // Sherry added + + for (int i = 0; i < numForests; ++i) + { + sForests[i] = NULL; + if (gNodeCount[i] > 0) + { + sForests[i] = SUPERLU_MALLOC (sizeof (sForest_t)); + sForests[i]->nNodes = gNodeCount[i]; + sForests[i]->numTrees = 1; + sForests[i]->nodeList = gNodeLists[i]; + sForests[i]->weight = calcNodeListWeight(sForests[i]->nNodes, 
sForests[i]->nodeList, treeList); + + calcTopInfoForest(sForests[i], nsupers, setree); + } + } + + return sForests; +} + +static int_t* sortPtr; + +static int cmpfuncInd (const void * a, const void * b) +{ + return ( sortPtr[*(int_t*)a] - sortPtr[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndex(int_t n, int_t* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtr = A; + + qsort(idx, n, sizeof(int_t), cmpfuncInd); + + return idx; +} + + +static double* sortPtrDouble; + +static int cmpfuncIndDouble (const void * a, const void * b) +{ + return ( sortPtrDouble[*(int_t*)a] > sortPtrDouble[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndexDouble(int_t n, double* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtrDouble = A; + + qsort(idx, n, sizeof(int_t), cmpfuncIndDouble); + + return idx; +} + +static int cmpfunc(const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + + +int_t* permuteArr(int_t n, int_t* A, int_t* perm) +{ + int_t* permA = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + permA[i] = A[perm[i]]; + } + + return permA; +} + + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree) +{ + + int_t nnodes = forest->nNodes; + int_t* nodeList = forest->nodeList; + + qsort(nodeList, nnodes, sizeof(int_t), cmpfunc); + int_t* myIperm = getMyIperm(nnodes, nsupers, nodeList); + int_t* myTopOrderOld = getMyTopOrder(nnodes, nodeList, myIperm, setree ); + int_t* myTopSortIdx = getSortIndex(nnodes, myTopOrderOld); + int_t* nodeListNew = permuteArr(nnodes, nodeList, myTopSortIdx); + int_t* myTopOrder = permuteArr(nnodes, myTopOrderOld, myTopSortIdx); + + SUPERLU_FREE(nodeList); + SUPERLU_FREE(myTopSortIdx); + SUPERLU_FREE(myIperm); + SUPERLU_FREE(myTopOrderOld); + myIperm = getMyIperm(nnodes, nsupers, 
nodeListNew); + + treeTopoInfo_t ttI; + ttI.myIperm = myIperm; + ttI.numLvl = myTopOrder[nnodes - 1] + 1; + ttI.eTreeTopLims = getMyEtLims(nnodes, myTopOrder); + + forest->nodeList = nodeListNew; + forest->topoInfo = ttI; + + SUPERLU_FREE(myTopOrder); // sherry added + + return 0; +} + +// #pragma optimize ("", off) + +double* getTreeWeights(int_t numTrees, int_t* gNodeCount, int_t** gNodeLists, treeList_t* treeList) +{ + double* gTreeWeights = DOUBLE_ALLOC(numTrees); + + // initialize with weight with whole subtree weights + for (int_t i = 0; i < numTrees; ++i) + { + gTreeWeights[i] = calcNodeListWeight(gNodeCount[i], gNodeLists[i], treeList); + } + + return gTreeWeights; + +} + +int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeCount = INT_T_ALLOC (numForests); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + {gNodeCount[i] = sForests[i]->nNodes;} + else + { + gNodeCount[i] = 0; + } + } + return gNodeCount; +} + +int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*)); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { + gNodeLists[i] = sForests[i]->nodeList; + } + else + { + gNodeLists[i] = NULL; + } + } + + return gNodeLists; +} + +int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeToForstMap = INT_T_ALLOC (nsupers); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { int_t nnodes = sForests[i]->nNodes; + int_t* nodeList = sForests[i]->nodeList; + for(int_t node = 0; nodenNodes; + } + + return myNodeCount; +} + + +int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + 
int_t** treePerm = (int_t** ) SUPERLU_MALLOC(sizeof(int_t*)*maxLvl); + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + treePerm[lvl] = NULL; + if (sForests[myTreeIdxs[lvl]]) + treePerm[lvl] = sForests[myTreeIdxs[lvl]]->nodeList; + } + return treePerm; +} + +int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm) +{ + int* isNodeInMyGrid = SUPERLU_MALLOC(nsupers * sizeof(int)); + + for(int i=0; igrid2d); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + double* gFrstCost = DOUBLE_ALLOC(numForests); + double* gFrstCostAcc = DOUBLE_ALLOC(numForests); + double* gFrstWt = DOUBLE_ALLOC(numForests); + + for (int i = 0; i < numForests; ++i) + { + gFrstCost[i] = 0; + gFrstWt[i] = 0; + if (sForests[i]) + { + gFrstCost[i] = sForests[i]->cost; + gFrstWt[i] = sForests[i]->weight; + } + } + + // reduce forest costs from all the grid; + MPI_Reduce(gFrstCost, gFrstCostAcc, numForests, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm); + + if (!grid3d->zscp.Iam && !grid->iam) + { + printf("|Forest | weight | cost | weight/Cost | \n"); + for (int i = 0; i < numForests; ++i) + { + /* code */ + double wt, ct; + wt = 0.0; + ct = 0.0; + if (sForests[i]) + { + wt = sForests[i]->weight; + } + printf("|%d | %.2e | %.2e | %.2e |\n", i, wt, gFrstCostAcc[i], 1e-9 * wt / gFrstCostAcc[i] ); + + } + + double* crPathCost = DOUBLE_ALLOC(numForests); + double* crPathWeight = DOUBLE_ALLOC(numForests); + // print the critcal path + for (int i = numForests - 1; i > -1 ; --i) + { + crPathCost[i] = gFrstCostAcc[i]; + crPathWeight[i] = gFrstWt[i]; + + if (2 * i + 1 < numForests) + { + + if (crPathCost[2 * i + 1] > crPathCost[2 * i + 2]) + { + /* code */ + crPathCost[i] += crPathCost[2 * i + 1]; + crPathWeight[i] += crPathWeight[2 * i + 1]; + } + else + { + crPathCost[i] += crPathCost[2 * i + 2]; + crPathWeight[i] += crPathWeight[2 * i + 2]; + } + } + } + + + printf("|CritcalPath | %.2e | %.2e | %.2e |\n", crPathWeight[0], crPathCost[0], 1e-9 
* crPathWeight[0] / crPathCost[0] ); + + double prsnCoeff = pearsonCoeff(numForests, gFrstCost, gFrstWt); + printf("|Pearsoncoefficient | %.3f |\n", prsnCoeff); + + printf("\n~~~mermaid \n"); + printf("\ngantt \n \ + \t\t dateFormat mm-ss \n\ + \t\t title TreeCost and Time Gantt Chart\n\n\n" ); + printf("\t Section Time\n"); + printGantt(0, numForests, "Time", 1.0 , gFrstCostAcc, crPathCost); + printf("\t Section Weight\n"); + printGantt(0, numForests, "weight", crPathCost[0]/crPathWeight[0] , gFrstWt, crPathWeight); + + printf("~~~\n\n\n"); + SUPERLU_FREE(crPathCost); + SUPERLU_FREE(crPathWeight); + + } + + SUPERLU_FREE( gFrstCost); + SUPERLU_FREE( gFrstCostAcc); + SUPERLU_FREE( gFrstWt); +} + + +void printGantt(int root, int numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost) +{ + + + if (2*root+1>=numForests) + { + /* if there are no more childrens*/ + printf("\t tree-%d \t:%s-%d, 0d, %.0fd \n", root,nodename, root, 100*scale*gFrstCostAcc[root] ); + } + else + { + printGantt(2*root+1, numForests, nodename, scale, gFrstCostAcc, crPathCost); + int depTree =crPathCost[2*root+1]> crPathCost[2*root+2]? 
2*root+1:2*root+2; + printf("\t tree-%d %.2g \t:%s-%d, after %s-%d, %.0fd \n", root,100*scale*crPathCost[root], nodename, root, nodename, depTree, 100*scale*gFrstCostAcc[root] ); + printGantt(2*root+2, numForests, nodename, scale, gFrstCostAcc, crPathCost); + } +} + +#define ABS(a) ((a)<0?-(a):a) +double getLoadImbalance(int_t nTrees, + int_t * treeIndx, // index of tree in gtrees + double * gTreeWeights) +{ + + if (nTrees < 1) + { + /* code */ + return 0; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, gTreeWeights); + // can not change weight array + w1 = gTreeWeights[wSortIdx[nTrees - 1]]; + + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += gTreeWeights[wSortIdx[i]]; + + } + else + { + w1 += gTreeWeights[wSortIdx[i]]; + + } + } + + SUPERLU_FREE(wSortIdx); + return ABS(w2 - w1) / (w2 + w1); + // return trPart; + +} + + +// maximum allowed imbalance +#define ACCEPTABLE_TREE_IMBALANCE 0.2 + + +// r forest contains a list of tree heads +// each treehead is an entire subtree (all level beloe) +#define MAX_TREE_ALLOWED 1024 + +typedef struct +{ + int_t ntrees; + int_t* treeHeads; +} rForest_t; + +typedef struct +{ + sForest_t* Ans; + rForest_t* S[2]; +} forestPartition_t; + +void freeRforest(rForest_t* rforest) +{ + SUPERLU_FREE(rforest->treeHeads); +} + + +sForest_t* createForestNew(int_t numTrees, int_t nsupers, int_t * nodeCounts, int_t** NodeLists, int_t * setree, treeList_t* treeList) +{ + if (numTrees == 0) return NULL; + + sForest_t* forest = SUPERLU_MALLOC(sizeof(sForest_t)); + forest->numTrees = numTrees; + + double frWeight = 0; + int_t nodecount = 0; + for (int_t i = 0; i < numTrees; ++i) + { + nodecount += nodeCounts[i]; + frWeight += calcNodeListWeight(nodeCounts[i], NodeLists[i], treeList); + } + + forest->nNodes = nodecount; + forest->weight = frWeight; + + int_t* nodeList = INT_T_ALLOC(forest->nNodes); + + int_t ptr = 0; + for (int_t i = 0; i < numTrees; ++i) + { + 
for (int_t j = 0; j < nodeCounts[i]; ++j) + { + /* copy the loop */ + nodeList[ptr] = NodeLists[i][j]; + ptr++; + } + } + + forest->nodeList = nodeList; + forest->cost = 0.0; + + + // using the nodelist create factorization ordering + calcTopInfoForest(forest, nsupers, setree); + + return forest; +} + +void oneLeveltreeFrPartition( int_t nTrees, int_t * trCount, int_t** trList, + int_t * treeSet, + double * sWeightArr) +{ + if (nTrees < 1) + { + /* code */ + trCount[0] = 0; + trCount[1] = 0; + return; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, sWeightArr); + // treeIndx= permuteArr(nTrees, treeIndx, wSortIdx); + + int_t S1ptr = 0; + int_t S2ptr = 0; + + // can not change weight array + w1 = sWeightArr[wSortIdx[nTrees - 1]]; + trList[0][S1ptr++] = treeSet[wSortIdx[nTrees - 1]]; + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += sWeightArr[wSortIdx[i]]; + trList[1][S2ptr++] = treeSet[wSortIdx[i]]; + } + else + { + w1 += sWeightArr[wSortIdx[i]]; + trList[0][S1ptr++] = treeSet[wSortIdx[i]]; + } + } + + trCount[0] = S1ptr; + trCount[1] = S2ptr; + + SUPERLU_FREE(wSortIdx); + +} /* oneLeveltreeFrPartition */ + +forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + int_t nTreeSet = rforest->ntrees; + int_t* treeHeads = rforest->treeHeads; + + int_t nAnc = 0; +#if 0 + int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = DOUBLE_ALLOC (MAX_TREE_ALLOWED); + // int_t* treeSet = INT_T_ALLOC(nTreeSet); + int_t* treeSet = INT_T_ALLOC(MAX_TREE_ALLOWED); +#else // Sherry fix + int_t* ancTreeCount = intMalloc_dist(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = doubleMalloc_dist(MAX_TREE_ALLOWED); + int_t* treeSet = intMalloc_dist(MAX_TREE_ALLOWED); 
+#endif + + for (int i = 0; i < nTreeSet; ++i) + { + treeSet[i] = treeHeads[i]; + weightArr[i] = treeList[treeHeads[i]].iWeight; + } + + while (getLoadImbalance(nTreeSet, treeSet, weightArr) > ACCEPTABLE_TREE_IMBALANCE ) + { + // get index of maximum weight subtree + int_t idx = 0; + for (int i = 0; i < nTreeSet; ++i) + { + /* code */ + if (treeList[treeSet[i]].iWeight > treeList[treeSet[idx]].iWeight) + { + /* code */ + idx = i; + } + } + + + int_t MaxTree = treeSet[idx]; + int_t* sroots = getSubTreeRoots(MaxTree, treeList); + if (sroots[0] == -1) + { + /* code */ + SUPERLU_FREE(sroots); + break; + } + + ancTreeCount[nAnc] = getCommonAncsCount(MaxTree, treeList); + //int_t * alist = INT_T_ALLOC (ancTreeCount[nAnc]); + int_t * alist = intMalloc_dist(ancTreeCount[nAnc]); + getCommonAncestorList(MaxTree, alist, setree, treeList); + ancNodeLists[nAnc] = alist; + nAnc++; + + + treeSet[idx] = treeSet[nTreeSet - 1]; + weightArr[idx] = treeList[treeSet[idx]].iWeight; + treeSet[nTreeSet - 1] = sroots[0]; + weightArr[nTreeSet - 1] = treeList[treeSet[nTreeSet - 1]].iWeight; + treeSet[nTreeSet] = sroots[1]; + weightArr[nTreeSet] = treeList[treeSet[nTreeSet]].iWeight; + nTreeSet += 1; + + SUPERLU_FREE(sroots); + + if (nTreeSet == MAX_TREE_ALLOWED) + { + break; + } + } + + // Create the Ancestor forest + sForest_t* aforest = createForestNew(nAnc, nsupers, ancTreeCount, ancNodeLists, setree, treeList); + + // create the weight array; + //double* sWeightArr = DOUBLE_ALLOC(nTreeSet); + double* sWeightArr = doubleMalloc_dist(nTreeSet); // Sherry fix + for (int i = 0; i < nTreeSet ; ++i) + sWeightArr[i] = treeList[treeSet[i]].iWeight; + + int_t trCount[2] = {0, 0}; + int_t* trList[2]; +#if 0 + trList[0] = INT_T_ALLOC(nTreeSet); + trList[1] = INT_T_ALLOC(nTreeSet); +#else // Sherry fix + trList[0] = intMalloc_dist(nTreeSet); + trList[1] = intMalloc_dist(nTreeSet); +#endif + + oneLeveltreeFrPartition( nTreeSet, trCount, trList, + treeSet, + sWeightArr); + + rForest_t *rforestS1, 
*rforestS2; +#if 0 + rforestS1 = SUPERLU_MALLOC(sizeof(rforest)); + rforestS2 = SUPERLU_MALLOC(sizeof(rforest)); +#else + rforestS1 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); // Sherry fix + rforestS2 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); +#endif + + rforestS1->ntrees = trCount[0]; + rforestS1->treeHeads = trList[0]; + + rforestS2->ntrees = trCount[1]; + rforestS2->treeHeads = trList[1]; + + forestPartition_t frPr_t; + frPr_t.Ans = aforest; + frPr_t.S[0] = rforestS1; + frPr_t.S[1] = rforestS2; + + SUPERLU_FREE(weightArr); + SUPERLU_FREE(treeSet); + SUPERLU_FREE(sWeightArr); + + // free stuff + // int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + // int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + for (int i = 0; i < nAnc ; ++i) + { + /* code */ + SUPERLU_FREE(ancNodeLists[i]); + } + + SUPERLU_FREE(ancTreeCount); + SUPERLU_FREE(ancNodeLists); + + return frPr_t; +} /* iterativeFrPartitioning */ + + +/* Create a single sforest */ +sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + int_t nTree = rforest->ntrees; + + // quick return + if (nTree < 1) return NULL; + + int_t* treeHeads = rforest->treeHeads; + int_t* nodeCounts = INT_T_ALLOC(nTree); + int_t** NodeLists = SUPERLU_MALLOC(nTree * sizeof(int_t*)); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + nodeCounts[i] = treeList[treeHeads[i]].numDescendents; + NodeLists[i] = INT_T_ALLOC(nodeCounts[i]); + getDescendList(treeHeads[i], NodeLists[i], treeList); + } + + + sForest_t* sforest = createForestNew(nTree, nsupers, nodeCounts, NodeLists, setree, treeList); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + SUPERLU_FREE(NodeLists[i]); + } + + SUPERLU_FREE(NodeLists); + SUPERLU_FREE(nodeCounts); + + return sforest; +} /* r2sForest */ + + +sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 
1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) nRootTrees++; + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + + } + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + + freeRforest(&rForests[0]); // sherry added + SUPERLU_FREE(rForests); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + freeRforest(frPr_t.S[0]); // Sherry added + freeRforest(frPr_t.S[1]); +#if 0 + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); +#endif + } else { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + + } + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); // Sherry added + } + + SUPERLU_FREE(rForests); // Sherry added + + return sForests; + +} /* getGreedyLoadBalForests */ + +// balanced 
forests at one level +sForest_t** getOneLevelBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + nRootTrees += 2; + } + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + } + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) + { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + } + else + { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + } + + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); + } + + SUPERLU_FREE(rForests); + + + + return sForests; + +} diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c new file mode 100644 index 00000000..c683feaf --- /dev/null +++ b/SRC/supernodal_etree.c @@ -0,0 +1,1060 @@ +/*! 
@file + * \brief function to generate supernodal etree + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include +#include +#include "superlu_ddefs.h" +//#include "supernodal_etree.h" + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +int_t log2i(int_t index) +{ + int_t targetlevel = 0; + while (index >>= 1) ++targetlevel; + return targetlevel; +} + +/** + * Returns Supernodal Elimination Tree + * @param nsuper Number of Supernodes + * @param etree Scalar elimination tree + * @param supno Vertex to supernode mapping + * @param xsup Supernodal boundaries + * @return Supernodal elimination tree + */ +int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup) +{ + // int_t *setree = malloc(sizeof(int_t) * nsuper); + int_t *setree = intMalloc_dist(nsuper); // Sherry fix + /*initialzing the loop*/ + for (int i = 0; i < nsuper; ++i) + { + setree[i] = nsuper; + } + /*calculating the setree*/ + for (int i = 0; i < nsuper - 1; ++i) + { + int_t ftree = etree[xsup[i + 1] - 1]; + if (ftree < xsup[nsuper]) + { + setree[i] = supno[etree[xsup[i + 1] - 1]]; + } + } + return setree; +} +/*takes supernodal elimination tree and for each +supernode calculates "level" in elimination tree*/ +int_t* topological_ordering(int_t nsuper, int_t* setree) +{ + // int_t *tsort_setree = malloc(sizeof(int_t) * nsuper); + int_t *tsort_setree = intMalloc_dist(nsuper); // Sherry fix + for (int i = 0; i < nsuper; ++i) + { + tsort_setree[i] = 0; /*initializing all levels to zero*/ + } + for (int i = 0; i < nsuper - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + tsort_setree[setree[i]] = SUPERLU_MAX(tsort_setree[setree[i]], tsort_setree[i] + 1); + } + return tsort_setree; +} + + +treeList_t* setree2list(int_t nsuper, int_t* setree ) +{ + treeList_t* treeList = (treeList_t* ) SUPERLU_MALLOC (sizeof(treeList_t) * (nsuper + 1)); + + // initialize the struct + for (int i = 0; i < nsuper + 1; ++i) + { + treeList[i].numChild = 0; + treeList[i].numDescendents = 1; /*numdescen includes myself*/ + treeList[i].left = -1; + treeList[i].right = 
-1; + treeList[i].right = -1; + treeList[i].depth = 0; + } + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].numDescendents += treeList[i].numDescendents; + treeList[parenti].numChild++; + } + + /*allocate memory for children lists*/ + for (int i = 0; i < nsuper + 1; ++i) + { + //treeList[i].childrenList = INT_T_ALLOC (treeList[i].numChild); + treeList[i].childrenList = intMalloc_dist(treeList[i].numChild); + treeList[i].numChild = 0; + } + + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].childrenList[treeList[parenti].numChild] = i; + treeList[parenti].numChild++; + } + + return treeList; + +} /* setree2list */ + +// Sherry added +int free_treelist(int_t nsuper, treeList_t* treeList) +{ + for (int i = 0; i < nsuper + 1; ++i) { + SUPERLU_FREE(treeList[i].childrenList); + } + SUPERLU_FREE(treeList); + return 0; +} + +int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + if (getenv("WF")) + { + if (strcmp(getenv("WF"), "One" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = 1.0; + } + } + else if (strcmp(getenv("WF"), "Ns" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz; + } + } + else if (strcmp(getenv("WF"), "NsDep" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz * dep; + } + } + else if (strcmp(getenv("WF"), "NsDep2" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + + } + + } + else + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = treeList[i].scuWeight; + } + } + } + else + { + for (int i = 0; i < nsupers; ++i) + { + 
treeList[i].weight = treeList[i].scuWeight; + + } + } + + return 0; +} /* estimateWeight */ + + +int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + + // initializing naive weight + for (int i = 0; i < nsupers; ++i) + { + treeList[i].depth = 0; + } + + for (int i = nsupers - 1; i > -1; --i) + { + /* code */ + int_t myDep = treeList[i].depth; + for (int cIdx = 0; cIdx < treeList[i].numChild; ++cIdx) + { + /* code */ + int_t child = treeList[i].childrenList[cIdx]; + treeList[child].depth = myDep + SuperSize(i) ; + + } + } + + + // for (int i = 0; i < nsupers; ++i) + // { + + // // treeList[i].weight = 1.0 * treeList[i].numDescendents; + // double dep = 1.0 * treeList[i].depth ; + // double sz = 1.0 * SuperSize(i); + // treeList[i].weight = 1.0; + // treeList[i].weight = sz; + // treeList[i].weight = sz * sz * sz; + // treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + // treeList[i].weight = treeList[i].scuWeight; + // // treeList[i].treeWeight = treeList[i].weight; + // // treeList[i].depth = 0; + // } + + estimateWeight(nsupers, setree, treeList, xsup); + + for (int i = 0; i < nsupers; ++i) + { + treeList[i].iWeight = treeList[i].weight; + } + + + for (int i = 0; i < nsupers; ++i) + { + int_t parenti = setree[i]; + treeList[parenti].iWeight += treeList[i].iWeight; + } + + + return 0; + +} /* calcTreeWeight */ + + +int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree) +{ + FILE* fp = fopen(sname, "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nnodes; ++i) + { + /* code */ + fprintf(fp, IFMT " -> " IFMT ";\n", dlist[i], setree[dlist[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); + return 0; +} + +int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList) +// post order traversal +{ + if (k < 0) return 0; + + int_t cDesc = 0; + + for 
(int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + int_t nChild = treeList[k].childrenList[child]; + cDesc += getDescendList(nChild, dlist + cDesc, treeList); + } + + dlist[cDesc] = k; + return cDesc + 1; +} + + +int_t getCommonAncsCount(int_t k, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + return count; +} +int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + + + alist[0] = cur; + for (int i = 1; i < count; ++i) + { + /* code */ + alist[i] = seTree[cur]; + cur = seTree[cur]; + } + return count; +} + +int cmpfunc (const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + +int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup, int_t* iperm_c_sup) +//from list of nodes, get permutation of factorization +{ + int_t* perm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + int_t* iperm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + // printf("%d %d %d\n",i, nlist[i],iperm_c_sup[nlist[i]] ); + iperm_l[i] = iperm_c_sup[nlist[i]]; //order of factorization + } + qsort(iperm_l, nnode, sizeof(int_t), cmpfunc); + + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + perm_l[i] = perm_c_sup[iperm_l[i]]; //order of factorization + } + SUPERLU_FREE(iperm_l); + return perm_l; +} +int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder) +// calculates EtreeLB boundaries for given list of nodes, via perm_l +{ + //calculate minimum and maximum topOrder + int minTop, maxTop; + minTop = gTopOrder[perm_l[0]]; + maxTop = 
gTopOrder[perm_l[nnodes - 1]]; + int numLB = maxTop - minTop + 2; + //int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB); + int_t* lEtreeLB = (int_t *) intMalloc_dist(numLB); // Sherry fix + for (int i = 0; i < numLB; ++i) + { + /* initalize */ + lEtreeLB[i] = 0; + } + lEtreeLB[0] = 0; + int curLevel = minTop; + int curPtr = 1; + for (int i = 0; i < nnodes ; ++i) + { + /* code */ + if (curLevel != gTopOrder[perm_l[i]]) + { + /* creset */ + curLevel = gTopOrder[perm_l[i]]; + lEtreeLB[curPtr] = i; + curPtr++; + } + } + lEtreeLB[curPtr] = lEtreeLB[curPtr - 1] + 1; + printf("numLB=%d curPtr=%d \n", numLB, curPtr); + for (int i = 0; i < numLB; ++i) + { + printf(IFMT, lEtreeLB[i]); + } + + return lEtreeLB; +} + +int_t* getSubTreeRoots(int_t k, treeList_t* treeList) +{ + int_t* srootList = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * 2); + int_t cur = k; + while (treeList[cur].numChild == 1 && cur > 0) + { + cur = treeList[cur].childrenList[0]; + } + + if (treeList[cur].numChild == 2) + { + /* code */ + srootList[0] = treeList[cur].childrenList[0]; + srootList[1] = treeList[cur].childrenList[1]; + // printf("Last node =%d, numchilds=%d, desc[%d] = %d, desc[%d] = %d \n ", + // cur, treeList[cur].numChild, + // srootList[0], treeList[srootList[0]].numDescendents, + // srootList[1], treeList[srootList[1]].numDescendents ); + } + else + { + /* code */ + srootList[0] = -1; + srootList[1] = -1; + } + + return srootList; +} + +int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount) +// tests disjoint and union +{ + //int_t* slist = (int_t* ) malloc(sizeof(int_t) * nsupers); + int_t* slist = intMalloc_dist(nsupers); // Sherry fix + /*intialize each entry with zero */ + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + slist[i] = 0; + } + for (int_t list = 0; list < numList; ++list) + { + /* code */ + for (int_t nd = 0; nd < nodeCount[list]; ++nd) + { + slist[nodeList[list][nd]]++; + } + } + + for (int_t i = 0; i < nsupers; ++i) + { + 
/* code */ + assert(slist[i] == 1); + } + printf("testSubtreeNodelist Passed\n"); + SUPERLU_FREE(slist); + return 0; +} +int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel) +{ + // checking monotonicity + for (int i = 0; i < nodeCount - 1; ++i) + { + if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]])) + { + /* code */ + printf("%d :" IFMT "(" IFMT ")" IFMT "(" IFMT ")\n", i, + permList[i], gTopLevel[permList[i]], + permList[i + 1], gTopLevel[permList[i + 1]] ); + } + assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); + } +#if 0 + int_t* slist = (int_t* ) malloc(sizeof(int_t) * nodeCount); + int_t* plist = (int_t* ) malloc(sizeof(int_t) * nodeCount); +#else + int_t* slist = intMalloc_dist(nodeCount); + int_t* plist = intMalloc_dist(nodeCount); +#endif + // copy lists + for (int_t i = 0; i < nodeCount; ++i) + { + slist[i] = nodeList[i]; + plist[i] = permList[i]; + } + // sort them + qsort(slist, nodeCount, sizeof(int_t), cmpfunc); + qsort(plist, nodeCount, sizeof(int_t), cmpfunc); + for (int_t i = 0; i < nodeCount; ++i) + { + assert( slist[i] == plist[i]); + } + printf("permList Test Passed\n"); + + SUPERLU_FREE(slist); + SUPERLU_FREE(plist); + + return 0; +} + + +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes); + +// Sherry: the following routine is not called ?? 
+int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + + // alloc address + //int_t* gperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* gperm = intMalloc_dist(nn); // Sherry fix + + //now concatenat arrays + int ptr = 0; + for (int tr = 0; tr < nperms; ++tr) + { + /* code */ + for (int nd = 0; nd < nnodes[tr]; ++nd) + { + /* code */ + gperm[ptr] = perms[tr][nd]; + printf("%d %d %d" IFMT "\n", tr, ptr, nd, perms[tr][nd] ); + ptr++; + } + } + mergPermTest( nperms, gperm, nnodes); + return gperm; +} /* merg_perms */ + +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + + // alloc address + // int_t* tperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* tperm = intMalloc_dist(nn); // Sherry fix + + for (int i = 0; i < nn; ++i) + { + tperm[i] = 0; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + printf("%d" IFMT "\n", i, gperms[i] ); + tperm[gperms[i]]++; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + assert(tperm[i] == 1); + } + SUPERLU_FREE(tperm); + return nn; +} /* mergPermTest */ + +#if 0 // Sherry: not called anymore +int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, + superlu_dist_options_t *options, + LocalLU_t *Llu, int_t* xsup, + int_t num_look_aheads, int_t nsupers, int_t * iperm_c_supno) +{ + /* constructing look-ahead table to indicate the last dependency */ + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t ncb = nsupers / Pc; + int_t nrb = nsupers / Pr; + stat->num_look_aheads = num_look_aheads; + int* look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int_t lb = 0; lb < 
nsupers; lb++) + look_ahead_l[lb] = -1; + /* go through U-factor */ + for (int_t lb = 0; lb < nrb; ++lb) + { + int_t ib = lb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[lb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) + { + int_t ib = nrb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[nrb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (options->SymPattern == NO) + { + /* go through L-factor */ + for (int_t lb = 0; lb < ncb; lb++) + { + int_t ib = lb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[lb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) + { + int_t ib = ncb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[ncb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, + grid->comm); + SUPERLU_FREE (look_ahead_l); + return look_ahead; +} + +int* getLastDepBtree( int_t nsupers, treeList_t* treeList) +{ + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int i = 0; i < nsupers; ++i) + { + look_ahead[i] = -1; + } + for (int k = 0; k < nsupers; 
++k) + { + /* code */ + for (int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + switch ( child) + { + case 0: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].left); + break; + case 1: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].right); + break; + case 2: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].extra); + break; + default: + break; + } + } + } + return look_ahead; +} + +#endif // Sherry: not called anymore + + +int_t* getGlobal_iperm(int_t nsupers, int_t nperms, // number of permutations + int_t** perms, // array of permutations + int_t* nnodes // number of nodes in each permutation + ) +{ + int_t* gperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t* giperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t ptr = 0; + for (int_t perm = 0; perm < nperms; ++perm) + { + /* code */ + for (int_t node = 0; node < nnodes[perm]; ++node) + { + /* code */ + gperm[ptr] = perms[perm][node]; + ptr++; + } + } + assert(ptr == nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + giperm[gperm[i]] = i; + } + SUPERLU_FREE(gperm); + return giperm; +} +int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* treeHeads = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + // for (int i = 0; i < numTrees; ++i) + // { + // /* code */ + // treeHeads[i]=0; + // } + treeHeads[0] = nsupers - 1; + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + int_t * sroots; + sroots = getSubTreeRoots(treeHeads[i], treeList); + treeHeads[2 * i + 1] = sroots[0]; + treeHeads[2 * i + 2] = sroots[1]; + SUPERLU_FREE(sroots); + } + } + return treeHeads; +} + +int_t* calcNumNodes(int_t maxLvl, int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* nnodes = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + for (int_t i = 0; i < numTrees; 
++i) + { + /* code */ + if (treeHeads[i] > -1) + { + /* code */ + nnodes[i] = treeList[treeHeads[i]].numDescendents; + } + else + { + nnodes[i] = 0; + } + + } + for (int_t i = 0; i < numTrees / 2 ; ++i) + { + /* code */ + nnodes[i] -= (nnodes[2 * i + 1] + nnodes[2 * i + 2]); + } + return nnodes; +} + +int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t** nodeList = SUPERLU_MALLOC (numTrees * sizeof (int_t*)); + for (int_t i = 0; i < numTrees; ++i) + { + /* code */ + if (nnodes[i] > 0) + { + nodeList[i] = SUPERLU_MALLOC (nnodes[i] * sizeof (int_t)); + assert(nodeList[i]); + } + else + { + nodeList[i] = NULL; + } + + } + + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + if (nodeList[i]) + getCommonAncestorList(treeHeads[i], nodeList[i], setree, treeList); + } + } + + int_t st = (1 << (maxLvl - 1)) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + getDescendList(treeHeads[i], nodeList[i], treeList); + } + return nodeList; +} + +int_t* getGridTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myTreeIdx = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + myTreeIdx[0] = grid3d->zscp.Np - 1 + grid3d->zscp.Iam ; + for (int i = 1; i < maxLvl; ++i) + { + /* code */ + myTreeIdx[i] = (myTreeIdx[i - 1] - 1) / 2; + } + return myTreeIdx; +} + +int_t* getReplicatedTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myZeroTrIdxs = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + for (int i = 0; i < maxLvl; ++i) + { + /* code */ + if (grid3d->zscp.Iam % (1 << i) ) + { + myZeroTrIdxs[i] = 1; + } + else + { + myZeroTrIdxs[i] = 0; + } + } + return myZeroTrIdxs; +} + + +int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm) +{ + if (nnodes < 0) 
return NULL; + int_t* myIperm = INT_T_ALLOC(nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + myIperm[i] = -1; + } + for (int_t i = 0; i < nnodes; ++i) + { + /* code */ + assert(myPerm[i] < nsupers); + myIperm[myPerm[i]] = i; + } + return myIperm; +} +int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ) +{ + if (nnodes < 0) return NULL; + int_t* myTopOrder = INT_T_ALLOC(nnodes); + for (int_t i = 0; i < nnodes; ++i) + { + myTopOrder[i] = 0; /*initializing all levels to zero*/ + } + for (int_t i = 0; i < nnodes - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + int_t inode = myPerm[i]; + int_t iparent = setree[inode]; + int_t iparentIdx = myIperm[iparent]; + // if(iparentIdx >= nnodes) printf("%d %d %d %d \n", inode, iparent, nnodes, iparentIdx); + // assert(iparentIdx < nnodes); + // if (iparentIdx != -1) + if (0<= iparentIdx && iparentIdx 0) + { + /* code */ + st = Etree_LvlBdry[i]; + } + for (int_t j = st; j < nsuper; ++j) + { + /* code */ + if (tsort_etree[perm[j]] == i + 1) + { + /* code */ + Etree_LvlBdry[i + 1] = j; + break; + } + } + } + Etree_LvlBdry[max_level] = nsuper; + return Etree_LvlBdry; +} + +int_t* calculate_num_children(int_t nsuper, int_t* setree) +{ + //int_t* etree_num_children = malloc(sizeof(int_t) * (nsuper)); + int_t* etree_num_children = intMalloc_dist(nsuper); // Sherry fix + for (int_t i = 0; i < nsuper; ++i) + { + /*initialize num children to zero*/ + etree_num_children[i] = 0; + } + for (int_t i = 0; i < nsuper; i++) + { + if (setree[i] < nsuper) + etree_num_children[setree[i]]++; + } + return etree_num_children; +} +void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper) +{ + for (int i = 0; i < max_level; ++i) + { + int st = 0; + int ed = nsuper; + st = Etree_LvlBdry[i]; + ed = Etree_LvlBdry[i + 1]; + printf("Level %d, NumSuperNodes=%d,\t Start=%d end=%d\n", i, ed - st, st, ed); + } +} + +void print_etree_leveled(int_t *setree, int_t* tsort_etree, 
int_t nsuper) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + int max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + fprintf(fp, "labelloc=\"t\";\n"); + fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, "%d -> " IFMT ";\n", i, setree[i]); + } + /*adding rank information*/ + for (int i = 0; i < max_level; ++i) + { + fprintf(fp, "{ rank=same; "); + for (int j = 0; j < nsuper; ++j) + { + if (tsort_etree[j] == i) + fprintf(fp, "%d ", j); + } + fprintf(fp, "}\n"); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void printEtree(int_t nsuper, int_t *setree, treeList_t* treeList) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + // int_t max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + // fprintf(fp, "labelloc=\"t\";\n"); + // fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, " \"%d|%ld\" -> \"%ld|%ld\";\n", i, treeList[i].depth, + (long int) setree[i], (long int) treeList[setree[i]].depth); + } + + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void print_etree(int_t *setree, int_t* iperm, int_t nsuper) +{ + FILE* fp = fopen("output.dot", "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nsuper; ++i) + { + /* code */ + fprintf(fp, IFMT " -> " IFMT ";\n", iperm[i], iperm[setree[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, 
"//EOF\n"); + fclose(fp); +} diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c new file mode 100644 index 00000000..4a27e531 --- /dev/null +++ b/SRC/sutil_dist.c @@ -0,0 +1,977 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Several matrix utilities + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * October 5, 2021
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+void
+sCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz,
+			    float *nzval, int_t *rowind, int_t *colptr,
+			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{   /* Fill header A as an m-by-n column-compressed (NC) matrix; nzval/rowind/colptr are adopted by reference, not copied. */
+    NCformat *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); /* only the Store descriptor itself is allocated here */
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NCformat *) A->Store;
+    Astore->nnz = nnz;
+    Astore->nzval = nzval;
+    Astore->rowind = rowind;
+    Astore->colptr = colptr;
+}
+
+void
+sCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
+			       int_t nnz_loc, int_t m_loc, int_t fst_row,
+			       float *nzval, int_t *colind, int_t *rowptr,
+			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{   /* Fill header A as a distributed row-compressed (NRformat_loc) matrix: this process owns m_loc rows starting at global row fst_row, with nnz_loc local nonzeros. Arrays are adopted by reference. */
+    NRformat_loc *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); /* only the Store descriptor itself is allocated here */
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NRformat_loc *) A->Store;
+    Astore->nnz_loc = nnz_loc;
+    Astore->fst_row = fst_row;
+    Astore->m_loc = m_loc;
+    Astore->nzval = nzval;
+    Astore->colind = colind;
+    Astore->rowptr = rowptr;
+}
+
+/*! \brief Convert a row compressed storage into a column compressed storage.
+ */
+void
+sCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz,
+                         float *a, int_t *colind, int_t *rowptr,
+                         float **at, int_t **rowind, int_t **colptr)
+{   /* Build freshly-allocated CSC arrays (*at, *rowind, *colptr) from the CSR input (a, colind, rowptr); caller frees the three outputs. */
+    register int i, j, col, relpos;
+    int_t *marker;
+
+    /* Allocate storage for another copy of the matrix. */
+    *at = (float *) floatMalloc_dist(nnz);
+    *rowind = intMalloc_dist(nnz);
+    *colptr = intMalloc_dist(n+1);
+    marker = intCalloc_dist(n); /* marker[c] = running insertion cursor for column c */
+
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+	(*colptr)[j+1] = (*colptr)[j] + marker[j]; /* prefix-sum of per-column counts */
+	marker[j] = (*colptr)[j]; /* reset marker to the column's start for the fill pass */
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    col = colind[j];
+	    relpos = marker[col];
+	    (*rowind)[relpos] = i;
+	    (*at)[relpos] = a[j];
+	    ++marker[col]; /* advance this column's cursor */
+	}
+    }
+
+    SUPERLU_FREE(marker);
+}
+
+/*! \brief Copy matrix A into matrix B. */
+void
+sCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{   /* Deep-copy the NC (column-compressed) contents of A into B; B->Store and its arrays must already be allocated large enough. */
+    NCformat *Astore, *Bstore;
+    int      ncol, nnz, i; /* NOTE(review): plain int may truncate a 64-bit int_t nnz on very large problems -- confirm */
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;  /* fixed: removed stray ';;' (empty statement) */
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+	((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void sPrint_CompCol_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of a column-compressed (NC) matrix to stdout: header fields, then nzval, rowind, and colptr arrays. */
+    NCformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nCompCol matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NCformat *) A->Store;
+    printf("nrow %lld, ncol %lld, nnz %lld\n", (long long) A->nrow,
+	    (long long) A->ncol, (long long) Astore->nnz);
+    if ( (dp = (float *) Astore->nzval) != NULL ) { /* nzval may be absent (pattern-only matrix) */
+        printf("nzval:\n");
+        for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
+    }
+    printf("\nrowind:\n");
+    for (i = 0; i < Astore->nnz; ++i)
+        printf("%lld  ", (long long) Astore->rowind[i]);
+    printf("\ncolptr:\n");
+    for (i = 0; i <= A->ncol; ++i)
+        printf("%lld  ", (long long) Astore->colptr[i]);
+    printf("\nend CompCol matrix.\n");
+}
+
+void sPrint_Dense_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of a dense (DN) matrix header and values to stdout. */
+    DNformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nDense matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (DNformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    printf("nrow %lld, ncol %lld, lda %lld\n",
+        (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
+    printf("\nnzval: ");
+    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]); /* NOTE(review): prints only the first nrow entries (first column when ncol > 1) -- confirm intended */
+    printf("\nend Dense matrix.\n");
+}
+
+int sPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of this process's local block of a distributed row-compressed (NRformat_loc) matrix to stdout. Always returns 0. */
+    NRformat_loc  *Astore;
+    int_t  nnz_loc, m_loc;
+    float  *dp;
+
+    printf("\n==== CompRowLoc matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    printf("nrow %ld, ncol %ld\n",
+            (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    PrintInt10("rowptr", m_loc+1, Astore->rowptr);
+    PrintInt10("colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL ) /* nzval may be absent (pattern-only matrix) */
+        Printfloat5("nzval", nnz_loc, dp);
+    printf("==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+int file_sPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
+{   /* Same dump as sPrint_CompRowLoc_Matrix_dist, but written to the caller-supplied stream fp. Always returns 0. */
+    NRformat_loc     *Astore;
+    int_t  nnz_loc, m_loc;
+    float       *dp;
+
+    fprintf(fp, "\n==== CompRowLoc matrix: ");
+    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    fprintf(fp, "nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    fprintf(fp, "nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr);
+    file_PrintInt10(fp, "colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL ) /* nzval may be absent (pattern-only matrix) */
+        file_Printfloat5(fp, "nzval", nnz_loc, dp);
+    fprintf(fp, "==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+void
+sCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, float *x,
+			  int_t ldx, Stype_t stype, Dtype_t dtype,
+			  Mtype_t mtype)
+{   /* Fill header X as an m-by-n dense (DN) matrix over the caller-owned column-major array x with leading dimension ldx; x is adopted by reference, not copied. */
+    DNformat    *Xstore;
+
+    X->Stype = stype;
+    X->Dtype = dtype;
+    X->Mtype = mtype;
+    X->nrow = m;
+    X->ncol = n;
+    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); /* only the Store descriptor itself is allocated here */
+    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
+    Xstore = (DNformat *) X->Store;
+    Xstore->lda = ldx;
+    Xstore->nzval = (float *) x;
+}
+
+void
+sCopy_Dense_Matrix_dist(int_t M, int_t N, float *X, int_t ldx,
+			float *Y, int_t ldy)
+{
+/*! \brief
+ *
+ * 
+ *  Purpose
+ *  =======
+ *
+ *  Copies a two-dimensional matrix X to another matrix Y.
+ * 
+ */ + int i, j; + + for (j = 0; j < N; ++j) + for (i = 0; i < M; ++i) + Y[i + j*ldy] = X[i + j*ldx]; +} + +void +sCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, + float *nzval, int_t *nzval_colptr, + int_t *rowind, int_t *rowind_colptr, + int_t *col_to_sup, int_t *sup_to_col, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + SCformat *Lstore; + + L->Stype = stype; + L->Dtype = dtype; + L->Mtype = mtype; + L->nrow = m; + L->ncol = n; + L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); + if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); + Lstore = L->Store; + Lstore->nnz = nnz; + Lstore->nsuper = col_to_sup[n]; + Lstore->nzval = nzval; + Lstore->nzval_colptr = nzval_colptr; + Lstore->rowind = rowind; + Lstore->rowind_colptr = rowind_colptr; + Lstore->col_to_sup = col_to_sup; + Lstore->sup_to_col = sup_to_col; + +} + +/**** The following utilities are added per request of SUNDIALS ****/ + +/*! \brief Clone: Allocate memory for a new matrix B, which is of the same type + * and shape as A. + * The clone operation would copy all the non-pointer structure members like + * nrow, ncol, Stype, Dtype, Mtype from A and allocate a new nested Store + * structure. It would also copy nnz_loc, m_loc, fst_row from A->Store + * into B->Store. It does not copy the matrix entries, row pointers, + * or column indices. 
+ */ +void sClone_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B) +{ + NRformat_loc *Astore, *Bstore; + + B->Stype = A->Stype; + B->Dtype = A->Dtype; + B->Mtype = A->Mtype; + B->nrow = A->nrow;; + B->ncol = A->ncol; + Astore = (NRformat_loc *) A->Store; + B->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); + if ( !(B->Store) ) ABORT("SUPERLU_MALLOC fails for B->Store"); + Bstore = (NRformat_loc *) B->Store; + + Bstore->nnz_loc = Astore->nnz_loc; + Bstore->m_loc = Astore->m_loc; + Bstore->fst_row = Astore->fst_row; + if ( !(Bstore->nzval = (float *) floatMalloc_dist(Bstore->nnz_loc)) ) + ABORT("floatMalloc_dist fails for Bstore->nzval"); + if ( !(Bstore->colind = (int_t *) intMalloc_dist(Bstore->nnz_loc)) ) + ABORT("intMalloc_dist fails for Bstore->colind"); + if ( !(Bstore->rowptr = (int_t *) intMalloc_dist(Bstore->m_loc + 1)) ) + ABORT("intMalloc_dist fails for Bstore->rowptr"); + + return; +} + +/* \brief Copy: copies all entries, row pointers, and column indices of + * a matrix into another matrix of the same type, + * B_{i,j}=A_{i,j}, for i,j=1,...,n + */ +void sCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B) +{ + NRformat_loc *Astore, *Bstore; + + Astore = (NRformat_loc *) A->Store; + Bstore = (NRformat_loc *) B->Store; + + memcpy(Bstore->nzval, Astore->nzval, Astore->nnz_loc * sizeof(float)); + memcpy(Bstore->colind, Astore->colind, Astore->nnz_loc * sizeof(int_t)); + memcpy(Bstore->rowptr, Astore->rowptr, (Astore->m_loc+1) * sizeof(int_t)); + + return; +} + +/*! \brief Sets all entries of a matrix to zero, A_{i,j}=0, for i,j=1,..,n */ +void sZero_CompRowLoc_Matrix_dist(SuperMatrix *A) +{ + float zero = 0.0; + NRformat_loc *Astore = A->Store; + float *aval; + int_t i; + + aval = (float *) Astore->nzval; + for (i = 0; i < Astore->nnz_loc; ++i) aval[i] = zero; + + return; +} + +/*! \brief Scale and add I: scales a matrix and adds an identity. 
+ * A_{i,j} = c * A_{i,j} + \delta_{i,j} for i,j=1,...,n and + * \delta_{i,j} is the Kronecker delta. + */ +void sScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *A, float c) +{ + float one = 1.0; + NRformat_loc *Astore = A->Store; + float *aval = (float *) Astore->nzval; + int i, j; + float temp; + + for (i = 0; i < Astore->m_loc; ++i) { /* Loop through each row */ + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + if ( (Astore->fst_row + i) == Astore->colind[j] ) { /* diagonal */ + temp = aval[j] * c; + aval[j] = temp + one; + } else { + aval[j] *= c; + } + } + } + + return; +} + +/*! \brief Scale and add: adds a scalar multiple of one matrix to another. + * A_{i,j} = c * A_{i,j} + B_{i,j}$ for i,j=1,...,n + */ +void sScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, float c) +{ + NRformat_loc *Astore = A->Store; + NRformat_loc *Bstore = B->Store; + float *aval = (float *) Astore->nzval, *bval = (float *) Bstore->nzval; + int_t i; + float temp; + + for (i = 0; i < Astore->nnz_loc; ++i) { /* Loop through each nonzero */ + aval[i] = c * aval[i] + bval[i]; + } + + return; +} +/**** end utilities added for SUNDIALS ****/ + +/*! \brief Allocate storage in ScalePermstruct */ +void sScalePermstructInit(const int_t m, const int_t n, + sScalePermstruct_t *ScalePermstruct) +{ + ScalePermstruct->DiagScale = NOEQUIL; + if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) + ABORT("Malloc fails for perm_r[]."); + if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) + ABORT("Malloc fails for perm_c[]."); +} + +/*! 
\brief Deallocate ScalePermstruct */
void sScalePermstructFree(sScalePermstruct_t *ScalePermstruct)
{
    SUPERLU_FREE(ScalePermstruct->perm_r);
    SUPERLU_FREE(ScalePermstruct->perm_c);
    /* R/C are only present when the corresponding equilibration was done. */
    switch ( ScalePermstruct->DiagScale ) {
      case ROW:
	SUPERLU_FREE(ScalePermstruct->R);
	break;
      case COL:
	SUPERLU_FREE(ScalePermstruct->C);
	break;
      case BOTH:
	SUPERLU_FREE(ScalePermstruct->R);
	SUPERLU_FREE(ScalePermstruct->C);
	break;
      default: break;
    }
}

/*
 * The following are from 3D code p3dcomm.c
 */

/* Allocate the replicated supernode maps (xsup, supno) in Glu_persist. */
int sAllocGlu_3d(int_t n, int_t nsupers, sLUstruct_t * LUstruct)
{
    /*broadcasting Glu_persist*/
    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
    return 0;
}

// Sherry added
/* Free the replicated data on 3D process layer that is not grid-0 */
int sDeAllocGlu_3d(sLUstruct_t * LUstruct)
{
    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
    SUPERLU_FREE(LUstruct->Glu_persist->supno);
    return 0;
}

/* Free the replicated data on 3D process layer that is not grid-0 */
int sDeAllocLlu_3d(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
{
    int i, nbc, nbr, nsupers;
    sLocalLU_t *Llu = LUstruct->Llu;

    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;

    /* Local block columns of L on this process column. */
    nbc = CEILING(nsupers, grid3d->npcol);
    for (i = 0; i < nbc; ++i)
	if ( Llu->Lrowind_bc_ptr[i] ) {
	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
	}
    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
    SUPERLU_FREE (Llu->Lnzval_bc_ptr);

    /* Local block rows of U on this process row. */
    nbr = CEILING(nsupers, grid3d->nprow);
    for (i = 0; i < nbr; ++i)
	if ( Llu->Ufstnz_br_ptr[i] ) {
	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
	}
    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
    SUPERLU_FREE (Llu->Unzval_br_ptr);

    /* The following can be freed after factorization. */
    SUPERLU_FREE(Llu->ToRecv);
    SUPERLU_FREE(Llu->ToSendD);
    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
    SUPERLU_FREE(Llu->ToSendR);
    return 0;
} /* sDeAllocLlu_3d */


/**** Other utilities ****/

/* Generate a reference solution: entries alternate slightly above and
   below 1.0 depending on row parity. */
void
sGenXtrue_dist(int_t n, int_t nrhs, float *x, int_t ldx)
{
    int i, j;
    for (j = 0; j < nrhs; ++j)
	for (i = 0; i < n; ++i) {
	    if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n;
	    else x[i + j*ldx] = 1.0 - (double)(i+1.)/n;
	}
}

/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
 */
void
sFillRHS_dist(char *trans, int_t nrhs, float *x, int_t ldx,
	      SuperMatrix *A, float *rhs, int_t ldb)
{
    float one = 1.0;
    float zero = 0.0;

    /* rhs := 1*op(A)*x + 0*rhs */
    sp_sgemm_dist(trans, nrhs, one, A, x, ldx, zero, rhs, ldb);

}

/*! \brief Fills a float precision array with a given value.
 */
void
sfill_dist(float *a, int_t alen, float dval)
{
    register int_t i;
    for (i = 0; i < alen; i++) a[i] = dval;
}



/*! \brief Check the inf-norm of the error vector
 */
void sinf_norm_error_dist(int_t n, int_t nrhs, float *x, int_t ldx,
			  float *xtrue, int_t ldxtrue,
			  gridinfo_t *grid)
{
    double err, xnorm;
    float *x_work, *xtrue_work;
    int i, j;

    for (j = 0; j < nrhs; j++) {
	x_work = &x[j*ldx];
	xtrue_work = &xtrue[j*ldxtrue];
	err = xnorm = 0.0;
	for (i = 0; i < n; i++) {
	    err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
	    xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
	}
	/* NOTE(review): if x is identically zero this divides by zero
	   and prints inf/nan — acceptable for a diagnostic printout. */
	err = err / xnorm;
	printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
    }
}

/* Print a float array, five entries per line, to stdout. */
void Printfloat5(char *name, int_t len, float *x)
{
    register int_t i;

    printf("%10s:", name);
    for (i = 0; i < len; ++i) {
	if ( i % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) i, (long int) i+4);
	printf("%14e", x[i]);
    }
    printf("\n");
}

/* Same as Printfloat5, but to an open FILE stream. */
int file_Printfloat5(FILE *fp, char *name, int_t len, float *x)
{
    register int_t i;

    fprintf(fp, "%10s:", name);
    for (i = 0; i < len; ++i) {
	if ( i % 5 == 0 ) fprintf(fp, "\n[%ld-%ld] ", (long int) i, (long int) i+4);
	fprintf(fp, "%14e", x[i]);
    }
    fprintf(fp, "\n");
    return 0;
}

/*! \brief Print the blocks in the factored matrix L.
 */
void sPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
		   Glu_persist_t *Glu_persist, sLocalLU_t *Llu)
{
    register int c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
    register int_t k, mycol, r;
    int_t *xsup = Glu_persist->xsup;
    int_t *index;
    float *nzval;

    printf("\n[%d] L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
    /* Number of local block columns owned by this process column. */
    ncb = nsupers / grid->npcol;
    extra = nsupers % grid->npcol;
    mycol = MYCOL( iam, grid );
    if ( mycol < extra ) ++ncb;
    for (lb = 0; lb < ncb; ++lb) {
	index = Llu->Lrowind_bc_ptr[lb];
	if ( index ) { /* Not an empty column */
	    nzval = Llu->Lnzval_bc_ptr[lb];
	    nb = index[0];      /* number of row blocks */
	    nsupr = index[1];   /* total rows in this block column */
	    gb = lb * grid->npcol + mycol;  /* global block number */
	    nsupc = SuperSize( gb );
	    printf("[%d] block column %d (local # %d), nsupc %d, # row blocks %d\n",
		   iam, gb, lb, nsupc, nb);
	    /* Walk the row blocks: k indexes the descriptor area,
	       r the running nzval row offset. */
	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
		len = index[k+1];
		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n",
		       iam, c, index[k], len);
		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
		for (j = 0; j < nsupc; ++j) {
		    Printfloat5("nzval", len, &nzval[r + j*nsupr]);
		}
		k += LB_DESCRIPTOR + len;
		r += len;
	    }
	}
	printf("(%d)", iam);
	PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
    }
    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
    k = CEILING( nsupers, grid->nprow );
    PrintInt10("fmod", k, Llu->fmod);

} /* SPRINTLBLOCKS */


/*! \brief Sets all entries of matrix L to zero.
+ */ +void sZeroLblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + float zero = 0.0; + register int extra, gb, j, lb, nsupc, nsupr, ncb; + register int_t k, mycol, r; + sLocalLU_t *Llu = LUstruct->Llu; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t *xsup = Glu_persist->xsup; + int_t *index; + float *nzval; + int_t nsupers = Glu_persist->supno[n-1] + 1; + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (j = 0; j < nsupc; ++j) { + for (r = 0; r < nsupr; ++r) { + nzval[r + j*nsupr] = zero; + } + } + } + } +} /* end sZeroLblocks */ + + +/*! \brief Dump the factored matrix L using matlab triple-let format + */ +void sDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, + Glu_persist_t *Glu_persist, sLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + float *nzval; + char filename[256]; + FILE *fp, *fopen(); + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR 
+ len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; ixsup; + int_t *index; + float *nzval; + + printf("\n[%d] U BLOCKS IN ROW-MAJOR ORDER -->\n", iam); + nrb = nsupers / grid->nprow; + extra = nsupers % grid->nprow; + myrow = MYROW( iam, grid ); + if ( myrow < extra ) ++nrb; + for (lb = 0; lb < nrb; ++lb) { + index = Llu->Ufstnz_br_ptr[lb]; + if ( index ) { /* Not an empty row */ + nzval = Llu->Unzval_br_ptr[lb]; + nb = index[0]; + printf("[%d] block row " IFMT " (local # %d), # column blocks %d\n", + iam, lb*grid->nprow+myrow, lb, nb); + r = 0; + for (c = 0, k = BR_HEADER; c < nb; ++c) { + jb = index[k]; + len = index[k+1]; + printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", + iam, c, jb, index[k+1]); + nsupc = SuperSize( jb ); + PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]); + Printfloat5("nzval", len, &nzval[r]); + k += UB_DESCRIPTOR + nsupc; + r += len; + } + + printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); + } + } +} /* end sPrintUlocks */ + +/*! \brief Sets all entries of matrix U to zero. 
 */
void sZeroUblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct)
{
    float zero = 0.0;
    register int i, extra, lb, len, nrb;
    register int myrow, r;
    sLocalLU_t *Llu = LUstruct->Llu;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    int_t *xsup = Glu_persist->xsup;
    int_t *index;
    float *nzval;
    int nsupers = Glu_persist->supno[n-1] + 1;

    /* Number of local block rows owned by this process row. */
    nrb = nsupers / grid->nprow;
    extra = nsupers % grid->nprow;
    myrow = MYROW( iam, grid );
    if ( myrow < extra ) ++nrb;
    for (lb = 0; lb < nrb; ++lb) {
	index = Llu->Ufstnz_br_ptr[lb];
	if ( index ) { /* Not an empty row */
	    nzval = Llu->Unzval_br_ptr[lb];
	    len = index[1]; // number of entries in nzval[];
	    for (i = 0; i < len; ++i) {
		nzval[i] = zero;
	    }
	}
    }
} /* end sZeroUlocks */

/* Dump the distributed sparse-matvec communication schedule to fp. */
int
sprint_gsmv_comm(FILE *fp, int_t m_loc, psgsmv_comm_t *gsmv_comm,
		 gridinfo_t *grid)
{
    int_t procs = grid->nprow*grid->npcol;
    fprintf(fp, "TotalIndSend " IFMT "\tTotalValSend " IFMT "\n", gsmv_comm->TotalIndSend,
	    gsmv_comm->TotalValSend);
    file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
    file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend);
    /* NOTE(review): ind_torecv is printed with length TotalValSend —
       verify this is intentional and not TotalIndRecv. */
    file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv);
    file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend);
    file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv);
    file_PrintInt32(fp, "SendCounts", procs, gsmv_comm->SendCounts);
    file_PrintInt32(fp, "RecvCounts", procs, gsmv_comm->RecvCounts);
    return 0;
}


/* Build xact = all-ones solution and b = A*xact for the local rows
   owned by this process row; allocates *xact and *b (caller frees). */
void
sGenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist,
	     gridinfo_t *grid, float **xact, int *ldx, float **b, int *ldb)
{
    int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows,
	  nsupr, nsupers, rel;
    int_t *supno, *xsup, *lxsup;
    float *x, *bb;
    NCformat *Astore;
    float *aval;

    n = A->ncol;
    *ldb = 0;
    supno = Glu_persist->supno;
    xsup = Glu_persist->xsup;
    nsupers = supno[n-1] + 1;
    iam = grid->iam;
    myrow = MYROW( iam, grid );
    Astore = (NCformat *) A->Store;
    aval = Astore->nzval;
    lb = CEILING( nsupers, grid->nprow ) + 1;
    if ( !(lxsup = intMalloc_dist(lb)) )
	ABORT("Malloc fails for lxsup[].");

    /* First pass: count local rows and record, per local supernode,
       its starting offset in the local RHS (lxsup). */
    lsup = 0;
    nlrows = 0;
    for (j = 0; j < nsupers; ++j) {
	i = PROW( j, grid );
	if ( myrow == i ) {
	    nsupr = SuperSize( j );
	    *ldb += nsupr;
	    lxsup[lsup++] = nlrows;
	    nlrows += nsupr;
	}
    }
    *ldx = n;
    if ( !(x = floatMalloc_dist(((size_t)*ldx) * nrhs)) )
	ABORT("Malloc fails for x[].");
    if ( !(bb = floatCalloc_dist(*ldb * nrhs)) )
	ABORT("Calloc fails for bb[].");
    for (j = 0; j < nrhs; ++j)
	for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0;

    /* Form b = A*x. */
    for (j = 0; j < n; ++j)
	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
	    irow = Astore->rowind[i];
	    gb = supno[irow];
	    gbrow = PROW( gb, grid );
	    if ( myrow == gbrow ) {
		rel = irow - xsup[gb];      /* row offset within supernode */
		lb = LBi( gb, grid );       /* local block number */
		bb[lxsup[lb] + rel] += aval[i] * x[j];
	    }
	}

    /* Memory allocated but not freed: xact, b */
    *xact = x;
    *b = bb;

    SUPERLU_FREE(lxsup);

#if ( PRNTlevel>=2 )
    for (i = 0; i < grid->nprow*grid->npcol; ++i) {
	if ( iam == i ) {
	    printf("\n(%d)\n", iam);
	    Printfloat5("rhs", *ldb, *b);
	}
	MPI_Barrier( grid->comm );
    }
#endif

} /* GENXTRUERHS */

/* g5.rua
       b = A*x    y = L\b
   0   1          1.0000
   1   0          0.2500
   2   1          1.0000
   3   2          2.0000
   4   1          1.7500
   5   1          1.8917
   6   0          1.1879
   7   2          2.0000
   8   2          2.0000
   9   1          1.0000
  10   1          1.7500
  11   0          0
  12   1          1.8750
  13   2          2.0000
  14   1          1.0000
  15   0          0.2500
  16   1          1.7667
  17   0          0.6419
  18   1          2.2504
  19   0          1.1563
  20   0          0.9069
  21   0          1.4269
  22   1          2.7510
  23   1          2.2289
  24   0          2.4332

   g6.rua
       b=A*x      y=L\b
   0   0          0
   1   1          1.0000
   2   1          1.0000
   3   2          2.5000
   4   0          0
   5   2          2.0000
   6   1          1.0000
   7   1          1.7500
   8   1          1.0000
   9   0          0.2500
  10   0          0.5667
  11   1          2.0787
  12   0          0.8011
  13   1          1.9838
  14   1          1.0000
  15   1          1.0000
  16   2          2.5000
  17   0          0.8571
  18   0          0
  19   1          1.0000
  20   0          0.2500
  21   1
1.0000 + 22 2 2.0000 + 23 1 1.7500 + 24 1 1.8917 + 25 0 1.1879 + 26 0 0.8011 + 27 1 1.9861 + 28 1 2.0199 + 29 0 1.3620 + 30 0 0.6136 + 31 1 2.3677 + 32 0 1.1011 + 33 0 1.5258 + 34 0 1.7628 + 35 0 2.1658 +*/ diff --git a/SRC/symbfact.c b/SRC/symbfact.c index 648853a1..6da8a99a 100644 --- a/SRC/symbfact.c +++ b/SRC/symbfact.c @@ -185,6 +185,7 @@ int_t symbfact if ( !pnum && (options->PrintStat == YES)) { nnzLU = nnzL + nnzU - min_mn; + printf("\tMatrix size min_mn " IFMT "\n", min_mn); printf("\tNonzeros in L " IFMT "\n", nnzL); printf("\tNonzeros in U " IFMT "\n", nnzU); printf("\tnonzeros in L+U " IFMT "\n", nnzLU); @@ -528,8 +529,8 @@ static int_t column_dfs */ lsub[nextl++] = krow; /* krow is indexed into A */ if ( nextl >= nzlmax ) { - if ( mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, - &nzlmax, Glu_freeable) ) + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, + &nzlmax, Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -570,10 +571,10 @@ static int_t column_dfs if ( chperm == EMPTY ) { lsub[nextl++] = kchild; if ( nextl >= nzlmax ) { - if ( mem_error = + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, &nzlmax, - Glu_freeable) ) + Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -786,8 +787,8 @@ static int_t set_usub new_next = nextu + nseg; while ( new_next > nzumax ) { - if (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, - Glu_freeable)) + if ( (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, + Glu_freeable)) ) return (mem_error); usub = Glu_freeable->usub; } diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c new file mode 100644 index 00000000..dd677995 --- /dev/null +++ b/SRC/treeFactorization.c @@ -0,0 +1,446 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief factorization routines in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + + #include "superlu_ddefs.h" + #if 0 + #include "treeFactorization.h" + #include "trfCommWrapper.h" + #endif + + #if 0 /******** Sherry: Remove extra layer of function calls. *******/ + int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + superlu_dist_options_t *options, + double thresh, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, int *info, + SCT_t *SCT, + int tag_ub + ) + { + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + MPI_Request * U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request * L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + + double * BlockUFactor = dFBuf->BlockUFactor; + double * BlockLFactor = dFBuf->BlockLFactor; + dDiagFactIBCast(k, k, BlockUFactor, BlockLFactor, + IrecvPlcd_D, + U_diag_blk_recv_req, L_diag_blk_recv_req, + U_diag_blk_send_req, L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); + return 0; + } + int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) + { + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + + dLPanelUpdate( k, IrecvPlcd_D, factored_L, + U_diag_blk_recv_req, BlockUFactor, grid, LUstruct, SCT); + return 0; + } + + int_t sUPanelUpdate( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) + { + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + + MPI_Request * L_diag_blk_recv_req = 
comReqs->L_diag_blk_recv_req; + + int_t * factored_U = factStat->factored_U; + + double * BlockLFactor = dFBuf->BlockLFactor; + dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor, bigV, ldt, + Ublock_info, grid, LUstruct, stat, SCT); + return 0; + } + int_t sIBcastRecvLPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) + + { + int* msgcnt = msgs->msgcnt; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t* factored = factStat->factored; + dIBcastRecvLPanel(k, k, + msgcnt, send_req, recv_req, + Lsub_buf, Lval_buf, factored, grid, LUstruct, SCT, tag_ub); + return 0; +} + +int_t sIBcastRecvUPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + dIBcastRecvUPanel(k, k, msgcnt, send_requ, recv_requ, Usub_buf, + Uval_buf, grid, LUstruct, SCT, tag_ub); + return 0; +} +int_t sWaitL(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + int* msgcntU = msgs->msgcntU; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + dWaitL(k, msgcnt, msgcntU, send_req, recv_req, grid, LUstruct, SCT); + return 0; +} +int_t sWaitU(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + dWaitU(k, msgcnt, 
send_requ, recv_requ, grid, LUstruct, SCT); + return 0; +} +int_t sWait_LUDiagSend(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + MPI_Request *U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request *L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + Wait_LUDiagSend(k, U_diag_blk_send_req, L_diag_blk_send_req, grid, SCT); + return 0; +} +int_t sSchurComplementSetup(int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* gIperm_c_supno, int_t*perm_c_supno, + factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct) +{ + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + Remain_info_t* Remain_info = packLUInfo->Remain_info; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int* msgcnt = msgs->msgcnt; + int_t* iperm_u = fNlists->iperm_u; + int_t* perm_u = fNlists->perm_u; + + double* bigU = scuBufs->bigU; + return dSchurComplementSetup(k, msgcnt, + Ublock_info, Remain_info, uPanelInfo, lPanelInfo, + gIperm_c_supno, iperm_u, perm_u, + bigU, Lsub_buf, Lval_buf, Usub_buf, Uval_buf, + grid, LUstruct); +} +int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct); + + return 0; +} + +int_t sUPanelTrSolve( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + double * BlockLFactor = 
dFBuf->BlockLFactor; + + dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT); + return 0; +} + +#endif /******** End removing extra layer of function calls. *******/ + + +int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid) +{ + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + // allocating MPI requests (for one) + comReqs->U_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->L_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->U_diag_blk_send_req = MPI_REQ_ALLOC( Pr ); + comReqs->L_diag_blk_send_req = MPI_REQ_ALLOC( Pc ); + comReqs->send_req = MPI_REQ_ALLOC(2 * Pc); + comReqs->recv_req = MPI_REQ_ALLOC(4); + comReqs->send_requ = MPI_REQ_ALLOC(2 * Pr); + comReqs->recv_requ = MPI_REQ_ALLOC(2); + return 0; +} + +commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +{ + commRequests_t** comReqss; + comReqss = (commRequests_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(commRequests_t*)); + for (int i = 0; i < mxLeafNode; ++i) + { + /* code */ + comReqss[i] = (commRequests_t* ) SUPERLU_MALLOC(sizeof(commRequests_t)); + initCommRequests(comReqss[i], grid); + }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/ + return comReqss; +} + +// sherry added +int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss) +{ + for (int i = 0; i < mxLeafNode; ++i) { + SUPERLU_FREE(comReqss[i]->U_diag_blk_recv_req); + SUPERLU_FREE(comReqss[i]->L_diag_blk_recv_req); + SUPERLU_FREE(comReqss[i]->U_diag_blk_send_req); + SUPERLU_FREE(comReqss[i]->L_diag_blk_send_req); + SUPERLU_FREE(comReqss[i]->send_req); + SUPERLU_FREE(comReqss[i]->recv_req); + SUPERLU_FREE(comReqss[i]->send_requ); + SUPERLU_FREE(comReqss[i]->recv_requ); + SUPERLU_FREE(comReqss[i]); + } + SUPERLU_FREE(comReqss); + return 0; +} + +int_t initFactStat(int_t nsupers, factStat_t* factStat) +{ + factStat->IrecvPlcd_D = intMalloc_dist( nsupers); + factStat->factored_D = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + 
factStat->factored_L = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored_U = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->IbcastPanel_L = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->IbcastPanel_U = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->gpuLUreduced = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + factStat->IrecvPlcd_D[i] = 0; + factStat->factored_D[i] = 0; + factStat->factored_L[i] = 0; + factStat->factored_U[i] = 0; + factStat->IbcastPanel_L[i] = 0; + factStat->IbcastPanel_U[i] = 0; + factStat->gpuLUreduced[i] = 0; + } + return 0; +} + +int freeFactStat(factStat_t* factStat) +{ + SUPERLU_FREE(factStat->IrecvPlcd_D); + SUPERLU_FREE(factStat->factored_D); + SUPERLU_FREE(factStat->factored_L); + SUPERLU_FREE(factStat->factored_U); + SUPERLU_FREE(factStat->factored); + SUPERLU_FREE(factStat->IbcastPanel_L); + SUPERLU_FREE(factStat->IbcastPanel_U); + SUPERLU_FREE(factStat->gpuLUreduced); + return 0; +} + +int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers, + factNodelists_t* fNlists) +{ + fNlists->iperm_u = INT_T_ALLOC(nsupers); + fNlists->perm_u = INT_T_ALLOC(nsupers); +#if 0 // Sherry: change to int type + fNlists->indirect = INT_T_ALLOC(num_threads * ldt); + fNlists->indirect2 = INT_T_ALLOC(num_threads * ldt); +#else + fNlists->indirect = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); + fNlists->indirect2 = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); +#endif + return 0; +} + +int freeFactNodelists(factNodelists_t* fNlists) +{ + SUPERLU_FREE(fNlists->iperm_u); + SUPERLU_FREE(fNlists->perm_u); + SUPERLU_FREE(fNlists->indirect); + SUPERLU_FREE(fNlists->indirect2); + return 0; +} + +int_t initMsgs(msgs_t* msgs) +{ + msgs->msgcnt = (int *) SUPERLU_MALLOC(4 * sizeof(int)); + msgs->msgcntU = (int *) SUPERLU_MALLOC(4 * 
sizeof(int)); + return 0; +} + +msgs_t** initMsgsArr(int_t numLA) +{ + msgs_t**msgss = (msgs_t**) SUPERLU_MALLOC(numLA * sizeof(msgs_t*)); + for (int_t i = 0; i < numLA; ++i) + { + /* code */ + msgss[i] = (msgs_t*) SUPERLU_MALLOC(sizeof(msgs_t)); + initMsgs(msgss[i]); + } /*minor for loop-3 for (int i = 0; i < numLA; ++i)*/ + return msgss; +} + +// sherry added +int freeMsgsArr(int_t numLA, msgs_t **msgss) +{ + for (int i = 0; i < numLA; ++i) { + SUPERLU_FREE(msgss[i]->msgcnt); + SUPERLU_FREE(msgss[i]->msgcntU); + SUPERLU_FREE(msgss[i]); + } + SUPERLU_FREE(msgss); + return 0; +} + +int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo) +{ + packLUInfo->Ublock_info = (Ublock_info_t*) SUPERLU_MALLOC (sizeof(Ublock_info_t) * nsupers); + packLUInfo->Remain_info = (Remain_info_t* ) SUPERLU_MALLOC(sizeof(Remain_info_t) * nsupers); + packLUInfo->uPanelInfo = (uPanelInfo_t* ) SUPERLU_MALLOC(sizeof(uPanelInfo_t)); + packLUInfo->lPanelInfo = (lPanelInfo_t*) SUPERLU_MALLOC(sizeof(lPanelInfo_t)); + return 0; +} + +int freePackLUInfo(packLUInfo_t* packLUInfo) // sherry added +{ + SUPERLU_FREE(packLUInfo->Ublock_info); + SUPERLU_FREE(packLUInfo->Remain_info); + SUPERLU_FREE(packLUInfo->uPanelInfo); + SUPERLU_FREE(packLUInfo->lPanelInfo); + return 0; +} + +int_t getNumLookAhead(superlu_dist_options_t *options) +{ + int_t numLA; + if (getenv("NLULA")) + { + numLA = atoi(getenv("NLULA")); + } + else + { + // printf("NLULA not set using default 2\n"); + // numLA = 2; + numLA = options->num_lookaheads; + } + return numLA; +} + +int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + int_t kcol = PCOL (k, grid); + + if (mycol == kcol && iam != pkk) + { + int_t flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + return flag; + } + + return 1; +} + 
+int_t checkRecvLDiag(int_t k, + commRequests_t *comReqs, + gridinfo_t *grid, + SCT_t *SCT) +{ + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && iam != pkk) + { + int_t flag = 0; + + flag = Test_LDiagBlock_Recv( L_diag_blk_recv_req , SCT); + + return flag; + } + return 1; +} + + diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c new file mode 100644 index 00000000..066edcab --- /dev/null +++ b/SRC/treeFactorizationGPU.c @@ -0,0 +1,716 @@ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "dlustruct_gpu.h" + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? 
Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} + +int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + diagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int_t ldt, + sluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + 
int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + syncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + initD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, 
info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int_t k_st = eTreeTopLims[topoLvl]; + int_t k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + initD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + 
sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t *indirect = fNlists->indirect; + int_t *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + double *bigV = scuBufs->bigV; + double *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#pragma omp parallel /* Look-ahead update on CPU */ + { + int_t thread_id = omp_get_thread_num(); + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, 
klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int_t offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + initD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + 
grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#pragma omp parallel + { + /* Master thread performs Schur complement update on GPU. */ +#pragma omp master + { + if (superlu_acc_offload) + { + int_t thread_id = omp_get_thread_num(); + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int_t streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + waitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + sendSCUdataHost2GPU(streamId, lsub, usub, bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + SchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#pragma omp for + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int_t kx_st = eTreeTopLims[topoLvl + 1]; + int_t kx_end = eTreeTopLims[topoLvl + 2]; + for (int_t k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int_t kx = perm_c_supno[k0x]; + int_t offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = 
checkRecvUDiag(kx, comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* 
end Schur complement update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end sparseTreeFactor_ASYNC_GPU */ diff --git a/SRC/trfAux.c b/SRC/trfAux.c new file mode 100644 index 00000000..ff96d1b3 --- /dev/null +++ b/SRC/trfAux.c @@ -0,0 +1,1280 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines to support 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include "superlu_ddefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfAux.h" +#endif + + +int_t getslu25D_enabled() +{ + if ( getenv("SLU25D") != NULL) + { + return atoi(getenv("SLU25D")); + } + else + { + return 0; + } +} + +int getNsupers(int n, Glu_persist_t *Glu_persist) +{ + int nsupers = Glu_persist->supno[n - 1] + 1; + return nsupers; +} + +int set_tag_ub() +{ + void *attr_val; + int flag; + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) + { + fprintf (stderr, "Could not get TAG_UB\n"); + exit(-1); + } + return ( *(int_t *) attr_val ); +} + +int getNumThreads(int iam) +{ + int num_threads = 1; +#ifdef _OPENMP + #pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + + } + } +#endif + + if (!iam) + { + printf(".. Starting with %d openMP threads \n", num_threads ); + + } + return num_threads; +} + + +#if 0 //**** Sherry: following two routines are old, the new ones are in util.c +int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u) +{ + int_t lk = LBi (kk, grid); + int_t *usub = Ufstnz_br_ptr[lk]; + + if (usub == NULL) + { + /* code */ + return 0; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC (kk + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t temp_ncols = 0; + int_t segsize; + + temp_ncols = 0; + + for (int_t j = 0; j < nub; ++j) + { + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + } + return temp_ncols; +} + +// Sherry: this is old; new version is in util.c +int_t estimate_bigu_size( int_t nsupers, int_t ldt, 
int_t**Ufstnz_br_ptr, + Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u) +{ + + int_t iam = grid->iam; + + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + + int_t* xsup = Glu_persist->xsup; + + int ncols = 0; + int_t ldu = 0; + + /*initilize perm_u*/ + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + } + + for (int lk = myrow; lk < nsupers; lk += Pr ) + { + ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, + xsup, grid, perm_u, &ldu)); + } + + int_t max_ncols = 0; + + MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); + + printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols); + return ldt * max_ncols; +} /* old estimate_bigu_size. New one is in util.c */ +#endif /**** end old ones ****/ + +int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr) +//LUstruct_t *LUstruct) +{ + + int_t Pr = grid->nprow; + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + + /* Following circuit is for finding maximum block size */ + int local_max_row_size = 0; + int max_row_size; + + for (int_t i = 0; i < nsupers; ++i) + { + int_t tpc = PCOL (i, grid); + if (mycol == tpc) + { + int_t lk = LBj (i, grid); + //int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk]; + int_t* lsub = Lrowind_bc_ptr[lk]; + if (lsub != NULL) + { + local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]); + } + } + + } + + /* Max row size is global reduction of within A row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + + // int_t Threads_per_process = get_thread_per_process (); + + /*Buffer size is max of of look ahead window*/ + + + int_t bigu_size = + 8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1); + + return bigu_size; +} + +int_t* getFactPerm(int_t nsupers) +{ + int_t* perm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + perm[i] = i; + } + + 
return perm; +} + +int_t* getFactIperm(int_t* perm, int_t nsupers) +{ + int_t* iperm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + iperm[perm[i]] = i; + } + + return iperm; +} + +int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *options, + int_t *etree, Glu_persist_t *Glu_persist, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo_t *grid) + +{ + /*I do not understand the following code in detail, + I have just written a wrapper around it*/ + + int_t* perm_c_supno; + //Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + //LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, *Lblock, *Lrows, + *sf_block, *sf_block_l, *nnodes_l, *nnodes_u, *edag_supno_l, *recvbuf, + **edag_supno; + int_t i, ib, jb, + lb, + nlb, il, iu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *index; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + int_t j, k, krow, yourcol; + etree_node *head, *tail, *ptr; + int *num_child; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if ( options->lookahead_etree == YES && /* use e-tree of symmetrized matrix, and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM)) ) /* no rowperm to destroy the symmetry */ + { + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if ( options->ParSymbFact == NO ) + { + /* Use the etree computed from serial symb. 
fact., and turn it + into supernodal tree. */ + //int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using column e-tree ===\n" ); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno[i] = nsupers; + for ( j = 0, lb = 0; lb < nsupers; lb++ ) + { + for ( k = 0; k < SuperSize(lb); k++ ) + { + jb = Glu_persist->supno[etree[j + k]]; + if ( jb != lb ) etree_supno[lb] = SUPERLU_MIN( etree_supno[lb], jb ); + } + j += SuperSize(lb); + } + } + else /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + { + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). */ +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using supernodal e-tree ===\n" ); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno_l[i] = nsupers; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * grid->npcol + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * grid->npcol + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } 
+ if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + MPI_Allreduce( etree_supno_l, etree_supno, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE(etree_supno_l); + } + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 0; + for ( i = 0; i < nsupers; i++ ) if ( etree_supno[i] != nsupers ) num_child[etree_supno[i]] ++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + ptr = head; j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + if ( etree_supno[j] != nsupers ) + { + num_child[etree_supno[j]] --; + if ( num_child[etree_supno[j]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + SUPERLU_FREE(etree_supno); + + } + else /* Unsymmetric pattern */ + { + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very 
naive implementation) */ + int nrbp1 = nrb + 1; + + /* allocate some workspace */ + if ( !(sendcnts = SUPERLU_MALLOC( (4 + 2 * nrbp1) * Pr * Pc * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls [Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls [Pr * Pc]; + rrows = &srows [Pr * Pc * nrbp1]; + + myrow = MYROW( iam, grid ); +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using DAG ===\n" ); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++ ) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[lb]; + index = Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + lb] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[nrb]; + index = Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + nrb] ++; + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if ( !(blocks = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for 
blocks[]."); + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[lb]; + index = Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[nrb]; + index = Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* communication */ + MPI_Alltoall( sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + MPI_Alltoall( srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm ); + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) + { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + nblocks += recvcnts[p]; + } + + if ( !(blockr = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for blockr[]."); + MPI_Alltoallv( blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE( blocks ); + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if ( !(Ublock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Ublock[]."); + if ( !(Urows = intMalloc_dist( 1 + nlb )) ) ABORT("Malloc fails for Urows[]."); + k = 0; + for (jb = 0; jb < nlb; jb++ ) + { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; 
Urows[jb] ++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1( Urows[jb], &(Ublock[k - Urows[jb]]) ); +#else + qsort( &(Ublock[k - Urows[jb]]), (size_t)(Urows[jb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; Urows[nlb] ++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1( Urows[nlb], &(Ublock[k - Urows[nlb]]) ); +#else + qsort( &(Ublock[k - Urows[nlb]]), (size_t)(Urows[nlb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + SUPERLU_FREE( blockr ); + + /* sort the block in L-factor */ + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + + if ( !(Lblock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Lblock[]."); + if ( !(Lrows = intMalloc_dist( 1 + ncb )) ) ABORT("Malloc fails for Lrows[]."); + for ( lb = 0; lb <= ncb; lb++ ) Lrows[lb] = 0; + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[lb] ++; + 
nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1( Lrows[lb], &(Lblock[nblocks - Lrows[lb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[lb]]), (size_t)(Lrows[lb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[ncb] ++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1( Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[ncb]]), (size_t)(Lrows[ncb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if ( !(sf_block = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for sf_block[]."); + if ( !(sf_block_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for sf_block_l[]."); + for ( lb = 0; lb < nsupers; lb++ ) sf_block_l[lb] = nsupers; + i = 0; j = 0; + for ( jb = 0; jb < nlb; jb++ ) + { + if ( Urows[jb] > 0 ) + { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for ( k = 0; k < Lrows[jb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } + else + { + j++; + } + } + i = ib; + } + else + { + j += Lrows[jb]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + if ( Urows[nlb] > 0 ) + { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for ( k = 0; k < Lrows[nlb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + 
j++; + } + } + i = ib; + } + else + { + j += Lrows[nlb]; + } + } + /* compute the first global symmetric matchs */ + MPI_Allreduce( sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE( sf_block_l ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if ( !(nnodes_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_l[]."); + if ( !(nnodes_u = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_u[]."); + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + if ( mycol == pc ) + 
{ + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + } + SUPERLU_FREE( nnodes_u ); + + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + SUPERLU_FREE(recvbuf); +#else + int nlsupers = nsupers / Pc; + if ( mycol < nsupers % Pc ) nlsupers ++; + + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nlsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( mycol == pc ) + { + edag_supno_l[nblocks] = 
nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + else if ( nnodes_l[lb] + nnodes_u[lb] != 0 ) + printf( " # %d: nnodes[%d]=%d+%d\n", grid->iam, + (int) lb, (int) nnodes_l[lb], (int) nnodes_u[lb] ); + } + SUPERLU_FREE( nnodes_u ); + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + } + SUPERLU_FREE(recvbuf); 
+#endif + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 0; + for ( i = 0; i < nsupers; i++ ) + { + for ( jb = 0; jb < nnodes_l[i]; jb++ ) + { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + + /*printf( "=== pop %d (%d) ===\n",head->id,i );*/ + ptr = head; j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + for ( jb = 0; jb < nnodes_l[j]; jb++ ) + { + num_child[edag_supno[j][jb]]--; + if ( num_child[edag_supno[j][jb]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + + for ( lb = 0; lb < nsupers; lb++ ) if ( nnodes_l[lb] > 0 ) SUPERLU_FREE(edag_supno[lb] ); + SUPERLU_FREE(edag_supno); + SUPERLU_FREE(nnodes_l); + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(sf_block); + SUPERLU_FREE(Ublock); + SUPERLU_FREE(Urows); + SUPERLU_FREE(Lblock); + SUPERLU_FREE(Lrows); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + return perm_c_supno; +} /* getPerm_c_supno */ + + +int_t Trs2_InitUblock_info(int_t klst, int_t nb, + Ublock_info_t *Ublock_info, + int_t 
*usub, + Glu_persist_t *Glu_persist, SuperLUStat_t *stat ) +{ + int_t *xsup = Glu_persist->xsup; + int_t iukp, rukp; + iukp = BR_HEADER; + rukp = 0; + + for (int_t b = 0; b < nb; ++b) + { + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + + Ublock_info[b].iukp = iukp; + Ublock_info[b].rukp = rukp; + // Ublock_info[b].nsupc = nsupc; + + iukp += UB_DESCRIPTOR; + /* Sherry: can remove this loop for rukp + rukp += usub[iukp-1]; + */ + for (int_t j = 0; j < nsupc; ++j) + { + int_t segsize = klst - usub[iukp++]; + rukp += segsize; + stat->ops[FACT] += segsize * (segsize + 1); + } + } + return 0; +} + +void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo3d_t * grid3d + ) +{ + gridinfo_t* grid = &(grid3d->grid2d); + //int_t** Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr; + //int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr; + //int_t* xsup = LUstruct->Glu_persist->xsup; + + int_t * perm_u = INT_T_ALLOC(nsupers); + int_t * mylsize = INT_T_ALLOC(nsupers); + int_t * myusize = INT_T_ALLOC(nsupers); + // int_t * maxlsize = INT_T_ALLOC(nsupers); + // int_t * maxusize = INT_T_ALLOC(nsupers); + int ldu; + + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + mylsize[i] = 0; + myusize[i] = 0; + } + + for (int_t k = 0; k < nsupers ; ++k) + { + treeList[k].scuWeight = 0.0; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t ldu; + + if (myrow == krow) + { + /* code */ + myusize[k] = num_full_cols_U(k, Ufstnz_br_ptr, xsup, grid, + perm_u, &ldu); + } + + if (mycol == kcol) + { + /* code */ + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + // double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + if (lsub) + { + /* code */ + mylsize[k] = lsub[1]; + } + } + } + + // int_t maxlsize = 0; + MPI_Allreduce( 
MPI_IN_PLACE, mylsize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + // int_t maxusize = 0; + MPI_Allreduce( MPI_IN_PLACE, myusize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + + for (int_t k = 0; k < nsupers ; ++k) + { + + treeList[k].scuWeight = 0.0; + int_t ksupc = SuperSize(k); + treeList[k].scuWeight = 1.0 * ksupc * mylsize[k] * myusize[k]; + } + + SUPERLU_FREE(mylsize); + SUPERLU_FREE(myusize); + SUPERLU_FREE(perm_u); + +} /* getSCUweight */ + diff --git a/SRC/util.c b/SRC/util.c index c70ee409..30175ee2 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -26,273 +26,65 @@ at the top-level directory. #include "superlu_ddefs.h" /*! \brief Deallocate the structure pointing to the actual storage of the matrix. */ -void -Destroy_SuperMatrix_Store_dist(SuperMatrix *A) +void Destroy_SuperMatrix_Store_dist(SuperMatrix *A) { - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(A->Store); } -void -Destroy_CompCol_Matrix_dist(SuperMatrix *A) +void Destroy_CompCol_Matrix_dist(SuperMatrix *A) { NCformat *Astore = A->Store; - SUPERLU_FREE( Astore->rowind ); - SUPERLU_FREE( Astore->colptr ); - if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowind); + SUPERLU_FREE(Astore->colptr); + if (Astore->nzval) + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) +void Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) { NRformat_loc *Astore = A->Store; - SUPERLU_FREE( Astore->rowptr ); - SUPERLU_FREE( Astore->colind ); - SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowptr); + SUPERLU_FREE(Astore->colind); + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRow_Matrix_dist(SuperMatrix *A) +void Destroy_CompRow_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); - SUPERLU_FREE( ((NRformat *)A->Store)->colind ); - SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); + 
SUPERLU_FREE(((NRformat *)A->Store)->rowptr); + SUPERLU_FREE(((NRformat *)A->Store)->colind); + SUPERLU_FREE(((NRformat *)A->Store)->nzval); + SUPERLU_FREE(A->Store); } -void -Destroy_SuperNode_Matrix_dist(SuperMatrix *A) +void Destroy_SuperNode_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); - SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((SCformat *)A->Store)->rowind); + SUPERLU_FREE(((SCformat *)A->Store)->rowind_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->nzval); + SUPERLU_FREE(((SCformat *)A->Store)->nzval_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->col_to_sup); + SUPERLU_FREE(((SCformat *)A->Store)->sup_to_col); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==NCP */ -void -Destroy_CompCol_Permuted_dist(SuperMatrix *A) +void Destroy_CompCol_Permuted_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); - SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((NCPformat *)A->Store)->colbeg); + SUPERLU_FREE(((NCPformat *)A->Store)->colend); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==DN */ -void -Destroy_Dense_Matrix_dist(SuperMatrix *A) +void Destroy_Dense_Matrix_dist(SuperMatrix *A) { - DNformat* Astore = A->Store; - SUPERLU_FREE (Astore->nzval); - SUPERLU_FREE ( A->Store ); -} - -#if 0 // moved to precision-dependent routines - -/*! \brief Destroy distributed L & U matrices. 
*/ -void -Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) -{ - int_t i, nb, nsupers; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_Tree()"); -#endif - - nsupers = Glu_persist->supno[n-1] + 1; - - nb = CEILING(nsupers, grid->npcol); - for (i=0;iLBtree_ptr[i].empty_==NO){ - // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); - C_BcTree_Nullify(&Llu->LBtree_ptr[i]); - } - if(Llu->UBtree_ptr[i].empty_==NO){ - // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); - C_BcTree_Nullify(&Llu->UBtree_ptr[i]); - } - } - SUPERLU_FREE(Llu->LBtree_ptr); - SUPERLU_FREE(Llu->UBtree_ptr); - - nb = CEILING(nsupers, grid->nprow); - for (i=0;iLRtree_ptr[i].empty_==NO){ - // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); - C_RdTree_Nullify(&Llu->LRtree_ptr[i]); - } - if(Llu->URtree_ptr[i].empty_==NO){ - // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); - C_RdTree_Nullify(&Llu->URtree_ptr[i]); - } - } - SUPERLU_FREE(Llu->LRtree_ptr); - SUPERLU_FREE(Llu->URtree_ptr); - - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit Destroy_Tree()"); -#endif + DNformat *Astore = A->Store; + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(A->Store); } -/*! \brief Destroy distributed L & U matrices. 
*/ -void -Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) -{ - int_t i, nb, nsupers; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_LU()"); -#endif - - Destroy_Tree(n, grid, LUstruct); - - nsupers = Glu_persist->supno[n-1] + 1; - - nb = CEILING(nsupers, grid->npcol); - // for (i = 0; i < nb; ++i) - // if ( Llu->Lrowind_bc_ptr[i] ) { - // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); - // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); - // } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - SUPERLU_FREE (Llu->Lrowind_bc_dat); - SUPERLU_FREE (Llu->Lrowind_bc_offset); - SUPERLU_FREE (Llu->Lnzval_bc_ptr); - SUPERLU_FREE (Llu->Lnzval_bc_dat); - SUPERLU_FREE (Llu->Lnzval_bc_offset); - - nb = CEILING(nsupers, grid->nprow); - for (i = 0; i < nb; ++i) - if ( Llu->Ufstnz_br_ptr[i] ) { - SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); - SUPERLU_FREE (Llu->Unzval_br_ptr[i]); - } - SUPERLU_FREE (Llu->Ufstnz_br_ptr); - SUPERLU_FREE (Llu->Unzval_br_ptr); - - /* The following can be freed after factorization. */ - SUPERLU_FREE(Llu->ToRecv); - SUPERLU_FREE(Llu->ToSendD); - SUPERLU_FREE(Llu->ToSendR[0]); - SUPERLU_FREE(Llu->ToSendR); - - /* The following can be freed only after iterative refinement. 
*/ - SUPERLU_FREE(Llu->ilsum); - SUPERLU_FREE(Llu->fmod); - SUPERLU_FREE(Llu->fsendx_plist[0]); - SUPERLU_FREE(Llu->fsendx_plist); - SUPERLU_FREE(Llu->bmod); - SUPERLU_FREE(Llu->bsendx_plist[0]); - SUPERLU_FREE(Llu->bsendx_plist); - SUPERLU_FREE(Llu->mod_bit); - - // nb = CEILING(nsupers, grid->npcol); - // for (i = 0; i < nb; ++i) - // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) { - // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]); - // } - SUPERLU_FREE(Llu->Lindval_loc_bc_ptr); - SUPERLU_FREE(Llu->Lindval_loc_bc_dat); - SUPERLU_FREE(Llu->Lindval_loc_bc_offset); - - nb = CEILING(nsupers, grid->npcol); - for (i=0; iLinv_bc_ptr[i]!=NULL) { - // SUPERLU_FREE(Llu->Linv_bc_ptr[i]); - // } - - if(Llu->Uinv_bc_ptr[i]!=NULL){ - SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); - } - } - SUPERLU_FREE(Llu->Linv_bc_ptr); - SUPERLU_FREE(Llu->Linv_bc_dat); - SUPERLU_FREE(Llu->Linv_bc_offset); - SUPERLU_FREE(Llu->Uinv_bc_ptr); - SUPERLU_FREE(Llu->Unnz); - - nb = CEILING(nsupers, grid->npcol); - for (i = 0; i < nb; ++i) - if ( Llu->Urbs[i] ) { - SUPERLU_FREE(Llu->Ucb_indptr[i]); - SUPERLU_FREE(Llu->Ucb_valptr[i]); - } - SUPERLU_FREE(Llu->Ucb_indptr); - SUPERLU_FREE(Llu->Ucb_valptr); - SUPERLU_FREE(Llu->Urbs); - - SUPERLU_FREE(Glu_persist->xsup); - SUPERLU_FREE(Glu_persist->supno); - -#ifdef GPU_ACC - checkGPU (gpuFree (Llu->d_xsup)); - checkGPU (gpuFree (Llu->d_LRtree_ptr)); - checkGPU (gpuFree (Llu->d_LBtree_ptr)); - checkGPU (gpuFree (Llu->d_ilsum)); - checkGPU (gpuFree (Llu->d_Lrowind_bc_dat)); - checkGPU (gpuFree (Llu->d_Lrowind_bc_offset)); - checkGPU (gpuFree (Llu->d_Lnzval_bc_dat)); - checkGPU (gpuFree (Llu->d_Lnzval_bc_offset)); - checkGPU (gpuFree (Llu->d_Linv_bc_dat)); - checkGPU (gpuFree (Llu->d_Linv_bc_offset)); - checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat)); - checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset)); -#endif - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit Destroy_LU()"); -#endif -} - -/*! 
\brief Allocate storage in LUstruct */ -void LUstructInit(const int_t n, LUstruct_t *LUstruct) -{ - if ( !(LUstruct->etree = intMalloc_dist(n)) ) - ABORT("Malloc fails for etree[]."); - if ( !(LUstruct->Glu_persist = (Glu_persist_t *) - SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) - ABORT("Malloc fails for Glu_persist_t."); - if ( !(LUstruct->Llu = (LocalLU_t *) - SUPERLU_MALLOC(sizeof(LocalLU_t))) ) - ABORT("Malloc fails for LocalLU_t."); - LUstruct->Llu->inv = 0; -} - -/*! \brief Deallocate LUstruct */ -void LUstructFree(LUstruct_t *LUstruct) -{ -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter LUstructFree()"); -#endif - - SUPERLU_FREE(LUstruct->etree); - SUPERLU_FREE(LUstruct->Glu_persist); - SUPERLU_FREE(LUstruct->Llu); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit LUstructFree()"); -#endif -} - -#endif // removed - - /*! \brief * *
@@ -300,57 +92,60 @@ void LUstructFree(LUstruct_t *LUstruct)
  * symmetrically reduced L. 
  * 
*/ -void -countnz_dist(const int_t n, int_t *xprune, - int_t *nnzL, int_t *nnzU, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +void countnz_dist(const int_t n, int_t *xprune, + int_t *nnzL, int_t *nnzU, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { - int_t fnz, fsupc, i, j, nsuper; - int_t jlen, irep; + int_t fnz, fsupc, i, j, nsuper; + int_t jlen, irep; long long int nnzL0; - int_t *supno, *xsup, *xlsub, *xusub, *usub; - - supno = Glu_persist->supno; - xsup = Glu_persist->xsup; - xlsub = Glu_freeable->xlsub; - xusub = Glu_freeable->xusub; - usub = Glu_freeable->usub; - *nnzL = 0; - *nnzU = 0; - nnzL0 = 0; + int_t *supno, *xsup, *xlsub, *xusub, *usub; + + supno = Glu_persist->supno; + xsup = Glu_persist->xsup; + xlsub = Glu_freeable->xlsub; + xusub = Glu_freeable->xusub; + usub = Glu_freeable->usub; + *nnzL = 0; + *nnzU = 0; + nnzL0 = 0; nsuper = supno[n]; - if ( n <= 0 ) return; + if (n <= 0) + return; /* * For each supernode in L. */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jlen = xlsub[fsupc+1] - xlsub[fsupc]; - - for (j = fsupc; j < xsup[i+1]; j++) { - *nnzL += jlen; - *nnzU += j - fsupc + 1; - jlen--; - } - irep = xsup[i+1] - 1; - nnzL0 += xprune[irep] - xlsub[irep]; + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jlen = xlsub[fsupc + 1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i + 1]; j++) + { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i + 1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; } - + /* printf("\tNo of nonzeros in symm-reduced L = %ld\n", nnzL0);*/ - + /* For each column in U. */ - for (j = 0; j < n; ++j) { - for (i = xusub[j]; i < xusub[j+1]; ++i) { - fnz = usub[i]; - fsupc = xsup[supno[fnz]+1]; - *nnzU += fsupc - fnz; - } + for (j = 0; j < n; ++j) + { + for (i = xusub[j]; i < xusub[j + 1]; ++i) + { + fnz = usub[i]; + fsupc = xsup[supno[fnz] + 1]; + *nnzU += fsupc - fnz; + } } } - /*! \brief * *
@@ -360,36 +155,38 @@ countnz_dist(const int_t n, int_t *xprune,
  * 
*/ int64_t -fixupL_dist(const int_t n, const int_t *perm_r, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +fixupL_dist(const int_t n, const int_t *perm_r, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { register int_t nsuper, fsupc, nextl, i, j, k, jstrt; register long long int lsub_size; - int_t *xsup, *lsub, *xlsub; + int_t *xsup, *lsub, *xlsub; - if ( n <= 1 ) return 0; + if (n <= 1) + return 0; - xsup = Glu_persist->xsup; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nextl = 0; + xsup = Glu_persist->xsup; + lsub = Glu_freeable->lsub; + xlsub = Glu_freeable->xlsub; + nextl = 0; nsuper = (Glu_persist->supno)[n]; lsub_size = xlsub[n]; - + /* * For each supernode ... */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jstrt = xlsub[fsupc]; - xlsub[fsupc] = nextl; - for (j = jstrt; j < xlsub[fsupc+1]; j++) { - lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ - nextl++; - } - for (k = fsupc+1; k < xsup[i+1]; k++) - xlsub[k] = nextl; /* Other columns in supernode i */ - + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc + 1]; j++) + { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc + 1; k < xsup[i + 1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ } xlsub[n] = nextl; @@ -400,28 +197,29 @@ fixupL_dist(const int_t n, const int_t *perm_r, */ void set_default_options_dist(superlu_dist_options_t *options) { - options->Fact = DOFACT; - options->Equil = YES; - options->ParSymbFact = NO; + options->Fact = DOFACT; + options->Equil = YES; + options->ParSymbFact = NO; #ifdef HAVE_PARMETIS - options->ColPerm = METIS_AT_PLUS_A; + options->ColPerm = METIS_AT_PLUS_A; #else - options->ColPerm = MMD_AT_PLUS_A; + options->ColPerm = MMD_AT_PLUS_A; #endif - options->RowPerm = LargeDiag_MC64; - options->ReplaceTinyPivot = NO; - options->IterRefine = SLU_DOUBLE; - options->Trans = NOTRANS; 
- options->SolveInitialized = NO; + options->RowPerm = LargeDiag_MC64; + options->ReplaceTinyPivot = NO; + options->IterRefine = SLU_DOUBLE; + options->Trans = NOTRANS; + options->SolveInitialized = NO; options->RefineInitialized = NO; - options->PrintStat = YES; - options->num_lookaheads = 10; - options->lookahead_etree = NO; - options->SymPattern = NO; + options->PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options->Algo3d = NO; #ifdef SLU_HAVE_LAPACK - options->DiagInv = YES; + options->DiagInv = YES; #else - options->DiagInv = NO; + options->DiagInv = NO; #endif } @@ -429,7 +227,8 @@ void set_default_options_dist(superlu_dist_options_t *options) */ void print_options_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. options:\n"); @@ -452,14 +251,15 @@ void print_options_dist(superlu_dist_options_t *options) */ void print_sp_ienv_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. blocking parameters from sp_ienv():\n"); - printf("** relaxation : " IFMT "\n", sp_ienv_dist(2)); - printf("** max supernode : " IFMT "\n", sp_ienv_dist(3)); - printf("** estimated fill ratio : " IFMT "\n", sp_ienv_dist(6)); - printf("** min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7)); + printf("** relaxation : %d\n", sp_ienv_dist(2)); + printf("** max supernode : %d\n", sp_ienv_dist(3)); + printf("** estimated fill ratio : %d\n", sp_ienv_dist(6)); + printf("** min GEMM m*k*n to use GPU : %d\n", sp_ienv_dist(7)); printf("**************************************************\n"); } @@ -471,33 +271,32 @@ void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) SUPERLU_FREE(gstrs_comm); } - /*! 
\brief Diagnostic print of segment info after panel_dfs(). */ -void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, - int_t *segrep, int_t *repfnz) +void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, + int_t *segrep, int_t *repfnz) { int_t j, k; - - for (j = jcol; j < jcol+w; j++) { - printf("\tcol " IFMT ":\n", j); - for (k = 0; k < nseg; k++) - printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, - segrep[k], repfnz[(j-jcol)*n + segrep[k]]); - } + for (j = jcol; j < jcol + w; j++) + { + printf("\tcol " IFMT ":\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, + segrep[k], repfnz[(j - jcol) * n + segrep[k]]); + } } -void -PStatInit(SuperLUStat_t *stat) +void PStatInit(SuperLUStat_t *stat) { register int_t i; - if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) ) - ABORT("Malloc fails for stat->utime[]"); - if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) ) - ABORT("SUPERLU_MALLOC fails for stat->ops[]"); - for (i = 0; i < NPHASES; ++i) { + if (!(stat->utime = SUPERLU_MALLOC(NPHASES * sizeof(double)))) + ABORT("Malloc fails for stat->utime[]"); + if (!(stat->ops = (flops_t *)SUPERLU_MALLOC(NPHASES * sizeof(flops_t)))) + ABORT("SUPERLU_MALLOC fails for stat->ops[]"); + for (i = 0; i < NPHASES; ++i) + { stat->utime[i] = 0.; stat->ops[i] = 0.; } @@ -506,168 +305,179 @@ PStatInit(SuperLUStat_t *stat) stat->gpu_buffer = 0.0; } -void -PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) +void PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) { - double *utime = stat->utime; + double *utime = stat->utime; flops_t *ops = stat->ops; - int_t iam = grid->iam; + int_t iam = grid->iam; flops_t flopcnt, factflop, solveflop; - if ( options->PrintStat == NO ) return; - - if ( !iam && options->Fact != FACTORED ) { - printf("**************************************************\n"); - 
printf("**** Time (seconds) ****\n"); + if (options->PrintStat == NO) + return; + if (!iam && options->Fact != FACTORED) + { + printf("**************************************************\n"); + printf("**** Time (seconds) ****\n"); if ( options->Equil != NO ) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); + printf("\tEQUIL time %8.3f\n", utime[EQUIL]); if ( options->RowPerm != NOROWPERM ) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); + printf("\tROWPERM time %8.3f\n", utime[ROWPERM]); if ( options->ColPerm != NATURAL ) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); - printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); - + printf("\tCOLPERM time %8.3f\n", utime[COLPERM]); + printf("\tSYMBFACT time %8.3f\n", utime[SYMBFAC]); + printf("\tDISTRIBUTE time %8.3f\n", utime[DIST]); } MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + 0, grid->comm); factflop = flopcnt; if ( !iam && options->Fact != FACTORED ) { - printf("\tFACTOR time %8.2f\n", utime[FACT]); + printf("\tFACTOR time %8.3f\n", utime[FACT]); if ( utime[FACT] != 0.0 ) printf("\tFactor flops\t%e\tMflops \t%8.2f\n", flopcnt, flopcnt*1e-6/utime[FACT]); } - - MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + + MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, + 0, grid->comm); solveflop = flopcnt; - if ( !iam ) { - printf("\tSOLVE time %8.3f\n", utime[SOLVE]); - if ( utime[SOLVE] != 0.0 ) - printf("\tSolve flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[SOLVE]); - if ( options->IterRefine != NOREFINE ) { - printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", - utime[REFINE], stat->RefineSteps); - } - printf("**************************************************\n"); + if (!iam) + { + printf("\tSOLVE time %8.3f\n", utime[SOLVE]); + if (utime[SOLVE] != 0.0) + printf("\tSolve flops\t%e\tMflops \t%8.2f\n", + flopcnt, + flopcnt * 1e-6 / utime[SOLVE]); + if (options->IterRefine != NOREFINE) + { 
+ printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", + utime[REFINE], stat->RefineSteps); + } + printf("**************************************************\n"); } - double *utime1,*utime2,*utime3,*utime4; - flops_t *ops1; -#if ( PROFlevel>=1 ) - fflush(stdout); - MPI_Barrier( grid->comm ); + double *utime1, *utime2, *utime3, *utime4; + flops_t *ops1; +#if (PROFlevel >= 1) + fflush(stdout); + MPI_Barrier(grid->comm); { - int_t i, P = grid->nprow*grid->npcol; - flops_t b, maxflop; - - - if ( !iam )utime1=doubleMalloc_dist(P); - if ( !iam )utime2=doubleMalloc_dist(P); - if ( !iam )utime3=doubleMalloc_dist(P); - if ( !iam )utime4=doubleMalloc_dist(P); - if ( !iam )ops1=(flops_t *) SUPERLU_MALLOC(P * sizeof(flops_t)); + int_t i, P = grid->nprow * grid->npcol; + flops_t b, maxflop; + + if (!iam) + utime1 = doubleMalloc_dist(P); + if (!iam) + utime2 = doubleMalloc_dist(P); + if (!iam) + utime3 = doubleMalloc_dist(P); + if (!iam) + utime4 = doubleMalloc_dist(P); + if (!iam) + ops1 = (flops_t *)SUPERLU_MALLOC(P * sizeof(flops_t)); + + // fflush(stdout); + // if ( !iam ) printf("\n.. Tree max sizes:\tbtree\trtree\n"); + // fflush(stdout); + // sleep(2.0); + // MPI_Barrier( grid->comm ); + // for (i = 0; i < P; ++i) { + // if ( iam == i) { + // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); + // fflush(stdout); + // } + // MPI_Barrier( grid->comm ); + // } + + // sleep(2.0); + + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); + + MPI_Gather(&utime[COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[FACT], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); - - // fflush(stdout); - // if ( !iam ) printf("\n.. 
Tree max sizes:\tbtree\trtree\n"); - // fflush(stdout); - // sleep(2.0); - // MPI_Barrier( grid->comm ); - // for (i = 0; i < P; ++i) { - // if ( iam == i) { - // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); - // fflush(stdout); - // } - // MPI_Barrier( grid->comm ); - // } - - // sleep(2.0); + if (!iam) + printf("\n.. FACT ops distribution:\n"); + MPI_Gather(&ops[FACT], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); - - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)\t%e\n", i, ops1[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); - MPI_Gather(&utime[COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[FACT], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT ops distribution:\n"); - MPI_Gather(&ops[FACT], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)\t%e\n", i, ops1[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - if ( !iam ) { - b = factflop/P/maxflop; - printf("\tFACT load balance: %.2f\n", b); - } - fflush(stdout); - MPI_Barrier( grid->comm ); + if (!iam) + { + b = factflop / P / maxflop; + printf("\tFACT load balance: %.2f\n", b); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. 
SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); + + MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE, utime3, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE, utime4, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i, utime1[i], utime2[i], utime3[i], utime4[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. SOLVE ops distribution:\n"); + MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t%d\t%e\n", i, ops1[i]); + } + MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) + { + b = solveflop / P / maxflop; + printf("\tSOLVE load balance: %.2f\n", b); + fflush(stdout); + } + } - - if ( !iam ) printf("\n.. SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); - - MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE,utime3, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE,utime4, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i,utime1[i],utime2[i],utime3[i], utime4[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. 
SOLVE ops distribution:\n"); - MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t%d\t%e\n", i, ops1[i]); - } - MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); - if ( !iam ) { - b = solveflop/P/maxflop; - printf("\tSOLVE load balance: %.2f\n", b); - fflush(stdout); - } - + if (!iam) + { + SUPERLU_FREE(utime1); + SUPERLU_FREE(utime2); + SUPERLU_FREE(utime3); + SUPERLU_FREE(utime4); + SUPERLU_FREE(ops1); } - - if ( !iam ){ - SUPERLU_FREE(utime1); - SUPERLU_FREE(utime2); - SUPERLU_FREE(utime3); - SUPERLU_FREE(utime4); - SUPERLU_FREE(ops1); - } - + #endif -/* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ + /* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ } -void -PStatFree(SuperLUStat_t *stat) +void PStatFree(SuperLUStat_t *stat) { SUPERLU_FREE(stat->utime); SUPERLU_FREE(stat->ops); @@ -678,13 +488,12 @@ PStatFree(SuperLUStat_t *stat) void ifill_dist(int_t *a, int_t alen, int_t ival) { register int_t i; - for (i = 0; i < alen; i++) a[i] = ival; + for (i = 0; i < alen; i++) + a[i] = ival; } - -void -get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) +void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) { int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; int_t *xsup; @@ -692,99 +501,112 @@ get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, i = j = *num_diag_procs = pkk = 0; nprow = grid->nprow; npcol = grid->npcol; - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; xsup = Glu_persist->xsup; - do { - ++(*num_diag_procs); - i = (++i) % nprow; - j = (++j) % npcol; - pkk = PNUM( i, j, grid ); - } while ( pkk != 0 ); /* Until wrap back to process 0 */ - if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) - ABORT("Malloc fails for 
diag_procs[]"); - if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) - ABORT("Calloc fails for diag_len[]"); - for (i = j = k = 0; k < *num_diag_procs; ++k) { - pkk = PNUM( i, j, grid ); - (*diag_procs)[k] = pkk; - i = (++i) % nprow; - j = (++j) % npcol; + do + { + ++(*num_diag_procs); + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; + pkk = PNUM(i, j, grid); + } while (pkk != 0); /* Until wrap back to process 0 */ + if (!(*diag_procs = intMalloc_dist(*num_diag_procs))) + ABORT("Malloc fails for diag_procs[]"); + if (!(*diag_len = intCalloc_dist(*num_diag_procs))) + ABORT("Calloc fails for diag_len[]"); + for (i = j = k = 0; k < *num_diag_procs; ++k) + { + pkk = PNUM(i, j, grid); + (*diag_procs)[k] = pkk; + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; } - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - i = k % *num_diag_procs; - (*diag_len)[i] += knsupc; + for (k = 0; k < nsupers; ++k) + { + knsupc = SuperSize(k); + i = k % *num_diag_procs; + (*diag_len)[i] += knsupc; } } - /*! 
\brief Get the statistics of the supernodes */ #define NBUCKS 10 -static int_t max_sup_size; +static int_t max_sup_size; void super_stats_dist(int_t nsuper, int_t *xsup) { register int_t nsup1 = 0; - int_t i, isize, whichb, bl, bh; - int_t bucket[NBUCKS]; + int_t i, isize, whichb, bl, bh; + int_t bucket[NBUCKS]; max_sup_size = 0; - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - if ( isize == 1 ) nsup1++; - if ( max_sup_size < isize ) max_sup_size = isize; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + if (isize == 1) + nsup1++; + if (max_sup_size < isize) + max_sup_size = isize; } - printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper+1); + printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper + 1); printf("\tmax supernode size = " IFMT "\n", max_sup_size); printf("\tno of size 1 supernodes = " IFMT "\n", nsup1); /* Histogram of the supernode sizes */ - ifill_dist (bucket, NBUCKS, 0); + ifill_dist(bucket, NBUCKS, 0); - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - whichb = (float) isize / max_sup_size * NBUCKS; - if (whichb >= NBUCKS) whichb = NBUCKS - 1; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + whichb = (float)isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) + whichb = NBUCKS - 1; bucket[whichb]++; } - + printf("\tHistogram of supernode sizes:\n"); - for (i = 0; i < NBUCKS; i++) { - bl = (float) i * max_sup_size / NBUCKS; - bh = (float) (i+1) * max_sup_size / NBUCKS; - printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl+1, bh, bucket[i]); + for (i = 0; i < NBUCKS; i++) + { + bl = (float)i * max_sup_size / NBUCKS; + bh = (float)(i + 1) * max_sup_size / NBUCKS; + printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl + 1, bh, bucket[i]); } - } - /*! \brief Check whether repfnz[] == EMPTY after reset. 
*/ void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) { int_t jj, k; - for (jj = jcol; jj < jcol+w; jj++) - for (k = 0; k < n; k++) - if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { - fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", - jj, k, repfnz[(jj-jcol)*n + k]); - ABORT("check_repfnz_dist"); - } + for (jj = jcol; jj < jcol + w; jj++) + for (k = 0; k < n; k++) + if (repfnz[(jj - jcol) * n + k] != EMPTY) + { + fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", + jj, k, repfnz[(jj - jcol) * n + k]); + ABORT("check_repfnz_dist"); + } } void PrintInt10(char *name, int_t len, int_t *x) { register int_t i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[" IFMT "-" IFMT "]", i, i+9); - printf(IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[" IFMT "-" IFMT "]", i, i + 9); + printf(IFMT, x[i]); } printf("\n"); } @@ -792,11 +614,13 @@ void PrintInt10(char *name, int_t len, int_t *x) void PrintInt32(char *name, int len, int *x) { register int i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9); - printf("%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[%2d-%2d]", i, i + 9); + printf("%6d", x[i]); } printf("\n"); } @@ -804,11 +628,13 @@ void PrintInt32(char *name, int len, int *x) int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) { register int_t i; - + fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i+9); - fprintf(fp, IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i + 9); + fprintf(fp, IFMT, x[i]); } fprintf(fp, "\n"); return 0; @@ -817,43 +643,50 @@ int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) int file_PrintInt32(FILE *fp, char *name, int len, int *x) { register int i; - + fprintf(fp, "%10s:", name); - 
for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9); - fprintf(fp, "%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[%2d-%2d]", i, i + 9); + fprintf(fp, "%6d", x[i]); } fprintf(fp, "\n"); return 0; } -int_t -CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) +int_t CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) { register int_t i, j, zd, numzd = 0; - for (j = 0; j < n; ++j) { - zd = 0; - for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) { - /*if ( iperm[rowind[i]] == j ) zd = 1;*/ - if ( rowind[i] == j ) { zd = 1; break; } - } - if ( zd == 0 ) { -#if ( PRNTlevel>=2 ) - printf(".. Diagonal of column %d is zero.\n", j); + for (j = 0; j < n; ++j) + { + zd = 0; + for (i = colbeg[j]; i < colbeg[j] + colcnt[j]; ++i) + { + /*if ( iperm[rowind[i]] == j ) zd = 1;*/ + if (rowind[i] == j) + { + zd = 1; + break; + } + } + if (zd == 0) + { +#if (PRNTlevel >= 2) + printf(".. Diagonal of column %d is zero.\n", j); #endif - ++numzd; - } + ++numzd; + } } return numzd; } - /* --------------------------------------------------------------------------- */ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY1 and ARRAY2 in the increasing @@ -872,34 +705,39 @@ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) * On entry, contains the array to be sorted. * On exit, contains the sorted array. 
*/ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY1[J] > ARRAY1[J + IGAP]) { - TEMP = ARRAY1[J]; - ARRAY1[J] = ARRAY1[J + IGAP]; - ARRAY1[J + IGAP] = TEMP; - TEMP = ARRAY2[J]; - ARRAY2[J] = ARRAY2[J + IGAP]; - ARRAY2[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY1[J] > ARRAY1[J + IGAP]) + { + TEMP = ARRAY1[J]; + ARRAY1[J] = ARRAY1[J + IGAP]; + ARRAY1[J + IGAP] = TEMP; + TEMP = ARRAY2[J]; + ARRAY2[J] = ARRAY2[J + IGAP]; + ARRAY2[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } - void isort1(int_t N, int_t *ARRAY) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY in increasing order. @@ -914,62 +752,72 @@ void isort1(int_t N, int_t *ARRAY) * On exit, contains the sorted array. 
* */ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY[J] > ARRAY[J + IGAP]) { - TEMP = ARRAY[J]; - ARRAY[J] = ARRAY[J + IGAP]; - ARRAY[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY[J] > ARRAY[J + IGAP]) + { + TEMP = ARRAY[J]; + ARRAY[J] = ARRAY[J + IGAP]; + ARRAY[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } /* Only log the memory for the buffer space, excluding the LU factors */ -void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) { - stat->current_buffer += (float) cur_bytes; - if (cur_bytes > 0) { - stat->peak_buffer = - SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); +void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) +{ + stat->current_buffer += (float)cur_bytes; + if (cur_bytes > 0) + { + stat->peak_buffer = + SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); } } -void print_memorylog(SuperLUStat_t *stat, char *msg) { +void print_memorylog(SuperLUStat_t *stat, char *msg) +{ printf("__ %s (MB):\n\tcurrent_buffer : %8.2f\tpeak_buffer : %8.2f\n", - msg, stat->current_buffer, stat->peak_buffer); + msg, stat->current_buffer, stat->peak_buffer); } -int compare_pair (const void *a, const void *b) +int compare_pair(const void *a, const void *b) { - return (((struct superlu_pair *) a)->val - ((struct superlu_pair *) b)->val); + return (((struct superlu_pair *)a)->val - ((struct superlu_pair *)b)->val); } int get_thread_per_process() -{ - char* ttemp; +{ + char *ttemp; ttemp = getenv("THREAD_PER_PROCESS"); - if(ttemp) return atoi(ttemp); - else return 1; + if (ttemp) + return atoi(ttemp); + else + return 1; } -int_t -get_max_buffer_size () +int_t get_max_buffer_size() { char *ttemp; - ttemp = getenv 
("MAX_BUFFER_SIZE"); + ttemp = getenv("MAX_BUFFER_SIZE"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 5000000; } @@ -980,7 +828,7 @@ get_gpublas_nb () char *ttemp; ttemp = getenv ("GPUBLAS_NB"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 64; } @@ -991,13 +839,12 @@ get_num_gpu_streams () char *ttemp; ttemp = getenv ("NUM_GPU_STREAMS"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 8; } -int_t -get_min (int_t * sums, int_t nprocs) +int_t get_min(int_t *sums, int_t nprocs) { int_t min_ind, min_val; min_ind = 0; @@ -1014,9 +861,8 @@ get_min (int_t * sums, int_t nprocs) return min_ind; } -int_t -static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, - int_t ldp, int_t * sums, int_t * counts, int nprocs) +int_t static_partition(struct superlu_pair *work_load, int_t nwl, int_t *partition, + int_t ldp, int_t *sums, int_t *counts, int nprocs) { //initialization loop for (int i = 0; i < nprocs; ++i) @@ -1024,16 +870,15 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, counts[i] = 0; sums[i] = 0; } - qsort (work_load, nwl, sizeof (struct superlu_pair), compare_pair); + qsort(work_load, nwl, sizeof(struct superlu_pair), compare_pair); // for(int i=0;i= 0; i--) { - int_t ind = get_min (sums, nprocs); + int_t ind = get_min(sums, nprocs); // printf("ind %d\n",ind ); partition[ldp * ind + counts[ind]] = work_load[i].ind; counts[ind]++; sums[ind] += work_load[i].val; - } return 0; @@ -1042,19 +887,18 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, /* * Search for the metadata of the j-th block in a U panel. */ -void -arrive_at_ublock (int_t j, /* j-th block in a U panel */ - int_t * iukp, /* output : point to index[] of j-th block */ - int_t * rukp, /* output : point to nzval[] of j-th block */ - int_t * jb, /* Global block number of block U(k,j). */ - int_t * ljb, /* Local block number of U(k,j). 
*/ - int_t * nsupc,/* supernode size of destination block */ - int_t iukp0, /* input : search starting point */ - int_t rukp0, - int_t * usub, /* U subscripts */ - int_t * perm_u, /* permutation vector from static schedule */ - int_t * xsup, /* for SuperSize and LBj */ - gridinfo_t * grid) +void arrive_at_ublock(int_t j, /* j-th block in a U panel */ + int_t *iukp, /* output : point to index[] of j-th block */ + int_t *rukp, /* output : point to nzval[] of j-th block */ + int_t *jb, /* Global block number of block U(k,j). */ + int_t *ljb, /* Local block number of U(k,j). */ + int_t *nsupc, /* supernode size of destination block */ + int_t iukp0, /* input : search starting point */ + int_t rukp0, + int_t *usub, /* U subscripts */ + int_t *perm_u, /* permutation vector from static schedule */ + int_t *xsup, /* for SuperSize and LBj */ + gridinfo_t *grid) { int_t jj; *iukp = iukp0; /* point to the first block in index[] */ @@ -1075,66 +919,70 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */ * usub[] - index array for panel U(k,:) */ // printf("iukp %d \n",*iukp ); - *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ + *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ // printf("jb %d \n",*jb ); - *nsupc = SuperSize (*jb); + *nsupc = SuperSize(*jb); // printf("nsupc %d \n",*nsupc ); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ *rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj); - Move to block U(k,jj+1) in nzval[] */ + Move to block U(k,jj+1) in nzval[] */ *iukp += *nsupc; } /* Set the pointers to the beginning of U block U(k,j) */ - *jb = usub[*iukp]; /* Global block number of block U(k,j). */ - *ljb = LBj (*jb, grid); /* Local block number of U(k,j). */ - *nsupc = SuperSize (*jb); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *jb = usub[*iukp]; /* Global block number of block U(k,j). 
*/ + *ljb = LBj(*jb, grid); /* Local block number of U(k,j). */ + *nsupc = SuperSize(*jb); + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ } - /* * Count the maximum size of U(kk,:) that I own locally. * September 28, 2016. * Modified December 4, 2018. */ -static int_t num_full_cols_U -( - int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, - gridinfo_t *grid, int_t *perm_u, - int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ +int_t num_full_cols_U( + int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u, + int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ ) { - int_t lk = LBi (kk, grid); + int_t lk = LBi(kk, grid); int_t *usub = Ufstnz_br_ptr[lk]; - if (usub == NULL) return 0; /* code */ + if (usub == NULL) + return 0; /* code */ - int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ - int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - - int_t klst = FstBlockC (kk + 1); + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC(kk + 1); int_t iukp0 = iukp; int_t rukp0 = rukp; - int_t jb,ljb; + int_t jb, ljb; int_t nsupc; int_t full = 1; int_t full_Phi = 1; int_t temp_ncols = 0; int_t segsize; - for (int_t j = 0; j < nub; ++j) { - - /* Sherry -- no need to search from beginning ?? */ + *ldu = 0; + + for (int_t j = 0; j < nub; ++j) + { + + /* Sherry -- no need to search from beginning ?? 
*/ arrive_at_ublock( - j, &iukp, &rukp, &jb, &ljb, &nsupc, - iukp0, rukp0, usub, perm_u, xsup, grid - ); - for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid); + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { segsize = klst - usub[jj]; - if ( segsize ) ++temp_ncols; - if ( segsize > *ldu ) *ldu = segsize; + if (segsize) + ++temp_ncols; + if (segsize > *ldu) + *ldu = segsize; } } return temp_ncols; @@ -1142,20 +990,20 @@ static int_t num_full_cols_U int_t estimate_bigu_size( int_t nsupers, - int_t**Ufstnz_br_ptr, /* point to U index[] array */ + int_t **Ufstnz_br_ptr, /* point to U index[] array */ Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u, int_t *max_ncols /* Output: Max. number of columns among all U(k,:). - This is used for allocating GEMM V buffer. */ - ) + This is used for allocating GEMM V buffer. */ +) { int_t iam = grid->iam; int_t Pc = grid->npcol; int_t Pr = grid->nprow; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - - int_t* xsup = Glu_persist->xsup; + int_t myrow = MYROW(iam, grid); + int_t mycol = MYCOL(iam, grid); + + int_t *xsup = Glu_persist->xsup; int_t ncols = 0; /* Count local number of nonzero columns */ int_t ldu = 0; /* Count max. 
segment size in one row U(k,:) */ @@ -1163,12 +1011,14 @@ int_t estimate_bigu_size( int_t max_ldu = 0; /* Initialize perm_u */ - for (int i = 0; i < nsupers; ++i) perm_u[i] = i; + for (int i = 0; i < nsupers; ++i) + perm_u[i] = i; - for (int lk = myrow; lk < nsupers; lk += Pr) {/* Go through my block rows */ + for (int lk = myrow; lk < nsupers; lk += Pr) + { /* Go through my block rows */ ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, - xsup, grid, perm_u, &ldu) ); - my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); + xsup, grid, perm_u, &ldu)); + my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); } #if 0 my_max_ldu = my_max_ldu*8; //YL: 8 is a heuristic number @@ -1179,116 +1029,247 @@ int_t estimate_bigu_size( MPI_Allreduce(&my_max_ldu, &max_ldu, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); MPI_Allreduce(&ncols, max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); -#if ( PRNTlevel>=1 ) - if ( iam==0 ) { - printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", - *max_ncols, max_ldu, max_ldu * (*max_ncols)); - fflush(stdout); +#if (PRNTlevel >= 1) + if (iam == 0) + { + printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", + *max_ncols, max_ldu, max_ldu * (*max_ncols)); + fflush(stdout); } #endif - return(max_ldu * (*max_ncols)); + return (max_ldu * (*max_ncols)); } -void quickSort( int_t* a, int_t l, int_t r, int_t dir) +void quickSort(int_t *a, int_t l, int_t r, int_t dir) { - int_t j; - - if( l < r ) - { - // divide and conquer - j = partition( a, l, r, dir); - quickSort( a, l, j-1, dir); - quickSort( a, j+1, r, dir); - } - + int_t j; + + if (l < r) + { + // divide and conquer + j = partition(a, l, r, dir); + quickSort(a, l, j - 1, dir); + quickSort(a, j + 1, r, dir); + } } -int_t partition( int_t* a, int_t l, int_t r, int_t dir) { - int_t pivot, i, j, t; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - t = a[i]; a[i] = 
a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - }else if(dir==1){ - while( 1) - { - do ++i; while( a[i] >= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - t = a[i]; a[i] = a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - } +int_t partition(int_t *a, int_t l, int_t r, int_t dir) +{ + int_t pivot, i, j, t; + pivot = a[l]; + i = l; + j = r + 1; + + if (dir == 0) + { + while (1) + { + do + ++i; + while (a[i] <= pivot && i <= r); + do + --j; + while (a[j] > pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + return 0; } +void quickSortM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) +{ + int_t j; + + if (l < r) + { + // printf("dims: %5d",dims); + // fflush(stdout); + // divide and conquer + j = partitionM(a, l, r, lda, dir, dims); + quickSortM(a, l, j-1, lda, dir, dims); + quickSortM(a, j+1, r, lda, dir, dims); + } +} -void quickSortM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) +int_t partitionM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t j; - - if( l < r ) - { - // printf("dims: %5d",dims); - // fflush(stdout); - - // divide and conquer - j = partitionM( a, l, r,lda,dir, dims); - quickSortM( a, l, j-1,lda,dir,dims); - quickSortM( a, j+1, r,lda,dir,dims); - } - + int_t pivot, i, j, t, dd; + pivot = a[l]; + i = l; + j = r + 1; + + if (dir == 0) + { + while (1) + { + do + ++i; + while (a[i] <= pivot && i <= r); + do + --j; + while (a[j] > pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) 
+ { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) + { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + + return 0; +} /* partitionM */ + +int_t **getTreePerm(int_t *myTreeIdxs, int_t *myZeroTrIdxs, + int_t *nodeCount, int_t **nodeList, + int_t *perm_c_supno, int_t *iperm_c_supno, + gridinfo3d_t *grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + int_t **treePerm = SUPERLU_MALLOC(sizeof(int_t *) * maxLvl); + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + // treePerm[lvl] = NULL; + int_t treeId = myTreeIdxs[lvl]; + treePerm[lvl] = getPermNodeList(nodeCount[treeId], nodeList[treeId], + perm_c_supno, iperm_c_supno); + } + return treePerm; +} + +int_t *getMyNodeCounts(int_t maxLvl, int_t *myTreeIdxs, int_t *gNodeCount) +{ + int_t *myNodeCount = INT_T_ALLOC(maxLvl); + for (int i = 0; i < maxLvl; ++i) + { + myNodeCount[i] = gNodeCount[myTreeIdxs[i]]; + } + return myNodeCount; } +/*chekc a vector vec of len across different process grids*/ +int_t checkIntVector3d(int_t *vec, int_t len, gridinfo3d_t *grid3d) +{ + int_t nP = grid3d->zscp.Np; + int_t myGrid = grid3d->zscp.Iam; + int_t *buf = intMalloc_dist(len); -int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t pivot, i, j, t, dd; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - for(dd=0;dd= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - for(dd=0;ddzscp.comm, &status); + + for (int_t i = 0; i < len; ++i) + { + /* code */ 
+ if (buf[i] != vec[i]) + { + /* code */ + printf("Error occured at (%d) Loc %d \n", (int)p, (int)i); + exit(0); + } + } + } + } + else + { + MPI_Send(vec, len, mpi_int_t, 0, myGrid, grid3d->zscp.comm); + } + + return 0; } +/** + * reduce the states from all the two grids before prinitng it out + * See the defenition of enum PhaseType in superlu_enum_const.h + */ +int_t reduceStat(PhaseType PHASE, + SuperLUStat_t *stat, gridinfo3d_t *grid3d) +{ + flops_t *ops = stat->ops; + flops_t flopcnt; + MPI_Reduce(&ops[PHASE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid3d->zscp.comm); + + if (!grid3d->zscp.Iam) + { + ops[PHASE] = flopcnt; + } + + return 0; +} + +/*---- end from 3D code p3dcomm.c ----*/ #ifdef GPU_ACC @@ -1525,3 +1506,36 @@ gemm_division_new (int * num_streams_used, /*number of streams that will be us } #endif /* defined GPU_ACC */ + +/* The following are moved from superlu_gpu.cu */ + +int getnGPUStreams() +{ + // Disabling multiple gpu streams + #if 1 + return 1; + #else + char *ttemp; + ttemp = getenv ("NUM_GPU_STREAMS"); + + if (ttemp) + return atoi (ttemp); + else + return 1; + #endif +} + +int get_mpi_process_per_gpu () +{ + char *ttemp; + ttemp = getenv ("MPI_PROCESS_PER_GPU"); + + if (ttemp) + return atol (ttemp); + else + { + printf("MPI_PROCESS_PER_GPU is not set; Using default 1 \n"); + return 1; + } +} + diff --git a/SRC/util_dist.h b/SRC/util_dist.h index 93dc5640..a52b7c1f 100644 --- a/SRC/util_dist.h +++ b/SRC/util_dist.h @@ -19,6 +19,7 @@ at the top-level directory. #include #include #include + #include "superlu_enum_consts.h" /* @@ -56,7 +57,11 @@ at the top-level directory. #define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) #define SUPERLU_MIN(x, y) ( (x) < (y) ? 
(x) : (y) ) - +// allocating macros +#define MPI_REQ_ALLOC(x) ((MPI_Request *) SUPERLU_MALLOC ( (x) * sizeof (MPI_Request))) +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + /* * Constants */ @@ -68,6 +73,13 @@ at the top-level directory. #define TRUE (1) #endif +/*==== For 3D code ====*/ +#define MAX_3D_LEVEL 32 /*allows for z dimensions of 2^32*/ +#define CBLOCK 192 +#define CACHE_LINE_SIZE 8 +#define CSTEPPING 8 +/*=====================*/ + /* * Type definitions */ @@ -150,4 +162,132 @@ typedef struct { #define SuperLU_U_NZ_START(col) ( Ustore->colptr[col] ) #define SuperLU_U_SUB(ptr) ( Ustore->rowind[ptr] ) +/*********************************************************************** + * For 3D code */ +/* SCT_t was initially Schur-complement counter to compute different + metrics of Schur-complement Update. + Later, it includes counters to keep track of many other metrics. +*/ +typedef struct +{ + int_t datatransfer_count; + int_t schurPhiCallCount; + int_t PhiMemCpyCounter; + double acc_load_imbal; + double LookAheadGEMMFlOp; + double PhiWaitTimer_2; + double LookAheadGEMMTimer; + double LookAheadRowSepTimer; + double LookAheadScatterTimer; + double GatherTimer ; + double GatherMOP ; + double scatter_mem_op_counter; + double LookAheadRowSepMOP ; + double scatter_mem_op_timer; + double schur_flop_counter; + double schur_flop_timer; + double CPUOffloadTimer; + double PhiWaitTimer; + double NetSchurUpTimer; + double AssemblyTimer; + double PhiMemCpyTimer; + double datatransfer_timer; + double LookAheadScatterMOP; + double schurPhiCallTimer; + double autotunetime; + double *Predicted_acc_sch_time; + double *Predicted_acc_gemm_time; + double *Predicted_acc_scatter_time; + + double trf2_flops; + double trf2_time; + double offloadable_flops; /*flops that can be done on ACC*/ + double offloadable_mops; /*mops that can be done on ACC*/ + + double 
*SchurCompUdtThreadTime; + double *Predicted_host_sch_time; + double *Measured_host_sch_time; + +#ifdef SCATTER_PROFILE + double *Host_TheadScatterMOP ; + double *Host_TheadScatterTimer; +#endif + +#ifdef OFFLOAD_PROFILE + double *Predicted_acc_scatter_time_strat1; + double *Predicted_host_sch_time_strat1; + size_t pci_transfer_count[18]; /*number of transfers*/ + double pci_transfer_time[18]; /*time for each transfer */ + double pci_transfer_prediction_error[18]; /*error in prediction*/ + double host_sch_time[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ + double host_sch_flop[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ +#endif + + double pdgstrs2_timer; + double pdgstrf2_timer; + double lookaheadupdatetimer; + double pdgstrfTimer; + +// new timers for different wait times + //convention: tl suffix refers to times measured from rdtsc + // td : suffix refers to times measured in SuerpLU_timer + + /* diagonal block factorization; part of pdgstrf2; called from thread*/ + // double Local_Dgstrf2_tl; + double *Local_Dgstrf2_Thread_tl; + /*wait for receiving U diagonal block: part of mpf*/ + double Wait_UDiagBlock_Recv_tl; + /*wait for receiving L diagonal block: part of mpf*/ + double Wait_LDiagBlock_Recv_tl; + + + /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */ + double Recv_UDiagBlock_tl; + /*wait for previous U block send to finish; part of pdgstrf2 */ + double Wait_UDiagBlockSend_tl; + /*after obtaining U block, time spent in calculating L panel*/ + double L_PanelUpdate_tl; + /*Synchronous Broadcasting L and U panel*/ + double Bcast_UPanel_tl; + double Bcast_LPanel_tl; + /*Wait for L send to finish */ + double Wait_LSend_tl; + + /*Wait for U send to finish */ + double Wait_USend_tl; + /*Wait for U receive */ + double Wait_URecv_tl; + /*Wait for L receive */ + double Wait_LRecv_tl; + + /*time to get lock*/ + double *GetAijLock_Thread_tl; + + /*U_panelupdate*/ + double PDGSTRS2_tl; + + /*profiling by phases 
*/ + double Phase_Factor_tl; + double Phase_LU_Update_tl; + double Phase_SC_Update_tl; + + /*3D timers*/ + double ancsReduce; /*timer for reducing ancestors before factorization*/ + double gatherLUtimer; /*timer for gather LU factors into bottom layer*/ + double tFactor3D[MAX_3D_LEVEL]; + double tSchCompUdt3d[MAX_3D_LEVEL]; + + /*ASync Profiler timing*/ + double tAsyncPipeTail; + + /*double t_Startup time before factorization starts*/ + double tStartup; + + /*keeping track of data sent*/ + double commVolFactor; + double commVolRed; + +} SCT_t; + + #endif /* __SUPERLU_UTIL */ diff --git a/SRC/zHWPM_CombBLAS.hpp b/SRC/zHWPM_CombBLAS.hpp index 7860e6a5..f45ebfed 100644 --- a/SRC/zHWPM_CombBLAS.hpp +++ b/SRC/zHWPM_CombBLAS.hpp @@ -73,7 +73,7 @@ zGetHWPM(SuperMatrix *A, gridinfo_t *grid, zScalePermstruct_t *ScalePermstruct) { printf("HWPM only supports square process grid. Retuning without a permutation.\n"); } - combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc; + combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc(grid->comm); std::vector< std::vector < std::tuple > > data(procs); /* ------------------------------------------------------------ diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c index e2957379..4c670505 100644 --- a/SRC/zbinary_io.c +++ b/SRC/zbinary_io.c @@ -18,6 +18,7 @@ zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -27,7 +28,7 @@ zwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("cmatrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); @@ -37,4 +38,5 @@ zwrite_binary(int_t n, int_t nnz, printf("dump binary file ... 
# of doubles fwrite: %d\n", nnz_written); assert(nnz_written == 2*nnz); fclose(fp1); + return 0; } diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c new file mode 100644 index 00000000..55f9c435 --- /dev/null +++ b/SRC/zcommunication_aux.c @@ -0,0 +1,502 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Communication routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_zdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t zIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t zIBcast_UPanel
+/*asynchronously braodcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously braodcasts U panel to my process row */
+int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     doublecomplex* uval, gridinfo_t *grid,
+		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zIrecv_LPanel
+/*it places Irecv call for L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, doublecomplex* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, zLocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t zIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, doublecomplex* Uval_buf, zLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column. */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+               SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ ,
+               scp->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
+               SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+               scp->comm, &recv_req_u[1]);
+
+    return 0;
+}
+
+int_t zWait_URecv
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zWait_LRecv
+/*waits till L blocks have been received*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t zISend_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size, /*number of elements to be broadcasted*/
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pr,
+                       SLU_MPI_TAG (4, k0) /* tag */ ,
+                       comm, U_diag_blk_send_req + pr);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                      int_t size, /*number of elements to be broadcasted*/
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    MPI_Recv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+              SLU_MPI_TAG (4, k0), comm, &status);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, zLocalLU_t *Llu)
+/*Copies src matrix into dest matrix*/
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    for (int j = 0; j < nsupc; ++j) {
+	memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(doublecomplex) );
+    }
+#endif
+    
+    return 0;
+}
+
+int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Comm comm = (grid->rscp).comm; /*Row communicator*/
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pc = 0; pc < Pc; ++pc)
+    {
+        if (pc != mycol)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pc,
+                       SLU_MPI_TAG (5, k0) /* tag */ ,
+                       comm, L_diag_blk_send_req + pc);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+               		   SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_UDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+                   SLU_MPI_TAG (5, k0),
+                   comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_lDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  krow = PROW (k, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+
+    MPI_Ibcast(ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, krow,comm, L_diag_blk_ibcast_req);
+    
+    // MPI_Status status;
+    // MPI_Wait(L_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+int_t zIBcast_LDiagBlock(int_t k, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  kcol = PCOL (k, grid);
+    MPI_Comm comm = (grid->rscp).comm;
+
+    MPI_Ibcast(lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, kcol,comm, U_diag_blk_ibcast_req);
+    // MPI_Status status;
+    // MPI_Wait(U_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+#endif 
+
+int_t zUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /*factor the L panel*/
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /*wait for communication to finish*/
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/zgather.c b/SRC/zgather.c
new file mode 100644
index 00000000..46f80f87
--- /dev/null
+++ b/SRC/zgather.c
@@ -0,0 +1,397 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include 
+#include "superlu_zdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t zprintMatrix(char*s, int n, int m, doublecomplex* A, int LDA)
+{
+    printf("%s\n", s );
+    for(int i=0; ixsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    zgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    zgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // zprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // zprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+/*! \brief Partition the blocks of block row U(k,:) between a CPU ("host")
+ *  set and a GPU ("Phi") set, record per-block metadata in HyP, and gather
+ *  each set into a contiguous buffer carved out of bigU.
+ *
+ *  A block jb is assigned to the CPU set when any of the following hold:
+ *    - iperm_c_supno[jb] precedes HyP->first_l_block_acc;
+ *    - jb falls inside the look-ahead window of k (myIperm test below);
+ *    - jb is k's etree parent and k is the parent's last remaining child.
+ *  Every other block is scheduled on the GPU side (the *_Phi fields).
+ *
+ *  On return, HyP holds the block counts (num_u_blks / num_u_blks_Phi),
+ *  leading dimensions (ldu / ldu_Phi), per-block prefix sums of column
+ *  counts (full_u_cols), and the packed buffers bigU_host / bigU_Phi.
+ */
+void zRgather_U( int_t k, int_t jj0, int_t *usub,	doublecomplex *uval,
+                 doublecomplex *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;      /* NOTE(review): written below but never read in this routine */
+    int_t full_Phi = 1;  /* NOTE(review): same — dead flag here; confirm before removing */
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;  /* redundant; already reset above */
+    HyP->ldu = 0;         /* redundant; already reset above */
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        /* Locate block j of U(k,:): sets iukp/rukp/jb/ljb/nsupc. */
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        /* Count the nonempty column segments of this block. */
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /* here goes the condition whether jb block exists on Phi or not */
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first column on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;  /* NOTE(review): duplicate of the assignment two lines up; harmless */
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            /* Schedule block jb on the CPU (host) side. */
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            /* Track the maximum segment height => leading dimension ldu. */
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            /* Schedule block jb on the GPU (Phi) side. */
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on ncols */
+    /* NOTE(review): reads/writes Ublock_info[0] even when num_u_blks == 0
+       (stale .ncols); appears benign since the count is 0, but confirm. */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    /* Host buffer starts right after the GPU portion inside bigU. */
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    /* Pack the two block sets into their contiguous buffers. */
+    zgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    zgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* zRgather_U */
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
new file mode 100644
index 00000000..4c97a5b1
--- /dev/null
+++ b/SRC/zlustruct_gpu.h
@@ -0,0 +1,236 @@
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_zdefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + doublecomplex *bigV; + doublecomplex *bigU; + doublecomplex *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + doublecomplex *Remain_L_buff; /* on GPU */ + doublecomplex *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} zSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + doublecomplex *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + doublecomplex *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + zSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + doublecomplex *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t 
*xsup_host; + int_t* perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} zLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + zLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} zsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int zinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + zsluGPU_t *sluGPU, + gridinfo_t *grid, + zLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int zreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + zsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + zLUstruct_t *LUstruct); + +extern int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT); +extern int zsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, zsluGPU_t *sluGPU); +extern int zsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, doublecomplex* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, zsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int 
zinitSluGPU3D_t( + zsluGPU_t *sluGPU, + zLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void zCopyLUToGPU3D (int* isNodeInMyGrid, zLocalLU_t *A_host, + zsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT); +extern int zfree_LUstruct_gpu (zLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(zsluGPU_t *sluGPU); + +extern void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex *dA, int_t lda ); + +/*to print out various statistics*/ +void zprintGPUStats(zLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c index 38a08080..edcaf944 100644 --- a/SRC/zmemory_dist.c +++ b/SRC/zmemory_dist.c @@ -169,8 +169,6 @@ doublecomplex *doublecomplexCalloc_dist(int_t n) return (buf); } -#if 0 ///////// Sherry - /*************************************** * The following are from 3D code. 
***************************************/ @@ -285,4 +283,3 @@ void z3D_printMemUse( trf3Dpartition_t* trf3Dpartition, zLUstruct_t *LUstruct, } } -#endif diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c new file mode 100644 index 00000000..6647d933 --- /dev/null +++ b/SRC/znrformat_loc3d.c @@ -0,0 +1,574 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_zdefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage:
+ * copy an n-by-m column-major block from Src (leading dimension ldsrc)
+ * into Dst (leading dimension lddst). */
+static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ *
+ *     Fact == DOFACT: allocate A3d/A2d and compute all counts/displacements;
+ *     Fact == SamePattern / SamePattern_SameRowPerm: reuse the counts stored
+ *             in *A3d_addr and only re-gather the (possibly modified) values.
+ */
+void zGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ doublecomplex *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit  /* NOTE(review): shadows function-scope declaration above */
+	int *nnz_disp; // displacement /* NOTE(review): shadows function-scope declaration above */
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	/* Gather per-layer local sizes onto the Z-root (zscp rank 0). */
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	/* Prefix sums; counts are valid on the Z-root only (MPI_Gather above). */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		/* Receive buffers exist only on the Z-root; on other ranks the
+		   A2d members below stay uninitialized but MPI ignores the recv
+		   arguments of MPI_Gatherv on non-root ranks. */
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		/* Shift each layer's rowptr entries by that layer's nnz offset. */
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    /* fst_row must be recomputed as the prefix sum of m_loc
+		       over the 2D grid-0 ranks. */
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	/* Persist the gathered matrix and the counts for later reuse/scatter. */
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (doublecomplex *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;; /* NOTE(review): stray ';;' — harmless empty statement */
+	
+	/* Btmp <- compact(B), compacting B */
+	doublecomplex *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	doublecomplex *B1; /* allocated on the Z-root only; uninitialized
+			      elsewhere, where MPI ignores the recv buffer */
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, B1,
+		    b_counts_int, b_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* reshape layer i's block-row from row-block layout
+			   into the global column-major B2d */
+			matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* zGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ *
+ * The first call (A3d->num_procs_to_send == EMPTY) builds a send/recv
+ * schedule that is persisted in A3d and reused by later solves.
+ * Returns 0.
+ */
+int zScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    doublecomplex *B = (doublecomplex *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    doublecomplex *B2d = (doublecomplex *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    doublecomplex *B1;  // on 2D layer 0; uninitialized on other ranks,
+                        // where it is never dereferenced (root-only send buf)
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d): undo the column-major packing per layer
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* layer i's rows back into contiguous block layout */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    doublecomplex *Btmp; // on 3D grid
+    Btmp = doublecomplexMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX,
+		     Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	/* NOTE(review): recv_reqs is sized npdep; confirm num_procs_to_recv
+	   cannot exceed npdep under skewed row partitions. */
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;   /* NOTE(review): shadows the declaration near the top */
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;   /* NOTE(review): shadows the declaration near the top */
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking number procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, SuperLU_MPI_DOUBLE_COMPLEX,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp): place received rows back into the caller's B(ldb)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);  /* A3d->B2d is reallocated by the next gather */
+    }
+
+    return 0;
+} /* zScatter_B3d */
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
index 526641aa..993de064 100644
--- a/SRC/zreadMM.c
+++ b/SRC/zreadMM.c
@@ -60,7 +60,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
@@ -107,7 +107,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 3/ Read n and nnz */
 #ifdef _LONGINT
-    sscanf(line, "%ld%ld%ld",m, n, nonz);
+    sscanf(line, "%lld%lld%lld", m, n, nonz);
 #else
     sscanf(line, "%d%d%d",m, n, nonz);
 #endif
diff --git a/SRC/zreadtriple.c b/SRC/zreadtriple.c
index a52eae5f..d8e4c9a2 100644
--- a/SRC/zreadtriple.c
+++ b/SRC/zreadtriple.c
@@ -46,7 +46,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
 #ifdef _LONGINT
-    fscanf(fp, "%ld%ld%ld", m, n, nonz);
+    fscanf(fp, "%lld%lld%lld", m, n, nonz);
 #else
     fscanf(fp, "%d%d%d", m, n, nonz);
 #endif
@@ -76,7 +76,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 
 #ifdef _LONGINT
-        fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+        fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #else // int 
         fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #endif
@@ -85,8 +85,9 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
-	    } else
+	    } else {
 		printf("triplet file: row/col indices are one-based.\n");
+     	    }
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
diff --git a/SRC/zreadtriple_noheader.c b/SRC/zreadtriple_noheader.c
index 3ddedb01..8410540b 100644
--- a/SRC/zreadtriple_noheader.c
+++ b/SRC/zreadtriple_noheader.c
@@ -48,7 +48,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     nz = *n = 0;
 
 #ifdef _LONGINT
-    ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i);
+    ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #else  // int
     ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #endif
@@ -61,7 +61,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	++nz;
 
 #ifdef _LONGINT
-        ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i);
+        ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #else  // int
         ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #endif
@@ -104,7 +104,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* Read into the triplet array from a file */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+	fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #else // int32
 	fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #endif
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
index afcfc2d3..ffe4d25a 100644
--- a/SRC/zscatter.c
+++ b/SRC/zscatter.c
@@ -306,7 +306,7 @@ gemm_division_cpu_gpu(
 )
 {
     int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
@@ -433,7 +433,7 @@ gemm_division_new (int * num_streams_used,   /*number of streams that will be us
     )
 {
     int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
new file mode 100644
index 00000000..9070fdca
--- /dev/null
+++ b/SRC/zscatter3d.c
@@ -0,0 +1,624 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, doublecomplex *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+zblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    doublecomplex *L_mat, int ldl,
+                    doublecomplex *U_mat, int ldu,
+                    doublecomplex *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        zscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+zblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         doublecomplex *L_mat, int_t ldl,
+                         doublecomplex *U_mat, int_t ldu,
+                         doublecomplex *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        zscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 doublecomplex* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 zLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopLeft */
+
+int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  zLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopRight */
+
+int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    zLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    //printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomLeft */
+
+int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     zLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           doublecomplex *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    doublecomplex *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                z_sub(&ucol[indirect[i]], &ucol[indirect[i]], &tempv[i]);
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/zsp_blas2_dist.c b/SRC/zsp_blas2_dist.c
index 4f8990eb..54a55bd3 100644
--- a/SRC/zsp_blas2_dist.c
+++ b/SRC/zsp_blas2_dist.c
@@ -426,9 +426,8 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A,
     }
 
     /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || 
-	z_eq(&alpha, &comp_zero) && 
-	z_eq(&beta, &comp_one))
+    if ( A->nrow == 0 || A->ncol == 0 || 
+	(z_eq(&alpha, &comp_zero) && z_eq(&beta, &comp_one)) )
 	return 0;
 
 
diff --git a/SRC/zsuperlu_blas.c b/SRC/zsuperlu_blas.c
new file mode 100644
index 00000000..2d413761
--- /dev/null
+++ b/SRC/zsuperlu_blas.c
@@ -0,0 +1,122 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_zdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_zgemm(const char *transa, const char *transb,
+                  int m, int n, int k, doublecomplex alpha, doublecomplex *a,
+                  int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return CGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    zgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return zgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+int superlu_ztrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, doublecomplex *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    ztrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return ztrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_zger(const int m, const int n, const doublecomplex alpha,
+                 const doublecomplex *x, const int incx, const doublecomplex *y,
+                 const int incy, doublecomplex *a, const int lda)
+{
+#ifdef _CRAY
+    CGERU(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    zgeru_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx)
+{
+    zscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_zaxpy(const int n, const doublecomplex alpha,
+    const doublecomplex *x, const int incx, doublecomplex *y, const int incy)
+{
+    zaxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_zgemv(const char *trans, const int m,
+                  const int n, const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, const doublecomplex *x, const int incx,
+                  const doublecomplex beta, doublecomplex *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_ztrsv(char *uplo, char *trans, char *diag,
+                  int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    CTRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
new file mode 100644
index 00000000..4cddd772
--- /dev/null
+++ b/SRC/zsuperlu_gpu.cu
@@ -0,0 +1,1791 @@
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "mpi.h" +// #include "sec_structs.h" +#include +#include +#include + +#undef Reduce + +//#include + +#include "zlustruct_gpu.h" + +#include "dcomplex.h" + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + doublecomplex *nzval, int_t ldv, + doublecomplex *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void zdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + doublecomplex *nzval, int ldv, + const doublecomplex *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + z_sub(&nzval[i], &nzval[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + doublecomplex * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + doublecomplex * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + z_sub(&ucol[i], &ucol[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + zLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + doublecomplex *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + doublecomplex *LnzvalVec = A_gpu->LnzvalVec; + doublecomplex *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int 
THREAD_BLOCK_SIZE =ldt; + + int* pfxStorage = (int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + doublecomplex *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + doublecomplex *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + doublecomplex *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if 
(thread_id < blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + doublecomplex *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + zdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + 
usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + doublecomplex * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + doublecomplex * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("zSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(doublecomplex), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(doublecomplex), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(doublecomplex) + + bigu_send_size * sizeof(doublecomplex) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; + + /* The following are used in gpublasZgemm() call */ + cuDoubleComplex *cu_alpha = (cuDoubleComplex*) α + cuDoubleComplex *cu_beta = (cuDoubleComplex*) β + cuDoubleComplex *cu_A, *cu_B, *cu_C; /* C <- A*B */ + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + 
// int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 
0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. */ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + cu_A = (cuDoubleComplex*) &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row]; + cu_B = (cuDoubleComplex*) &A_gpu->scubufs[streamId].bigU[st_col * ldu]; + cu_C = (cuDoubleComplex*) A_gpu->scubufs[streamId].bigV; + gpublasZgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, cu_alpha, + cu_A, Rnbrow, cu_B, ldu, cu_beta, + cu_C, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 8.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + 
/* --- tail of zSchurCompUpdate_GPU (function begins earlier in the file) --- */
			gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);

			A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols;
		    } /* endif ... none of the matrix dimension is zero. */

		} /* end while jj_end < nub */

	} /* end while (ii_end < RemainBlk) */

	return 0;
} /* end zSchurCompUpdate_GPU */


/* Query and print the occupancy (minimum grid size / block size) that the
 * runtime's launch configurator suggests for Scatter_GPU_kernel. */
static void print_occupancy()
{
    int blockSize;   // The launch configurator returned block size
    int minGridSize; /* The minimum grid size needed to achieve the
                        best potential occupancy */

    gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize,
                                       Scatter_GPU_kernel, 0, 0);
    printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize);
}

/* Print a few identifying properties of the given GPU device, plus the
 * amount of currently free device memory (queried via gpuMemGetInfo). */
static void printDevProp(gpuDeviceProp devProp)
{
	size_t mfree, mtotal;
	gpuMemGetInfo (&mfree, &mtotal);

	printf("pciBusID:                      %d\n", devProp.pciBusID);
	printf("pciDeviceID:                   %d\n", devProp.pciDeviceID);
	printf("GPU Name:                      %s\n", devProp.name);
	printf("Total global memory:           %zu\n", devProp.totalGlobalMem);
	printf("Total free memory:             %zu\n", mfree);
	printf("Clock rate:                    %d\n", devProp.clockRate);

	return;
}


/* Return the device-memory budget for this process: 90% of the currently
 * free device memory, divided by the number of MPI ranks sharing the GPU. */
static size_t get_acc_memory ()
{

	size_t mfree, mtotal;
	gpuMemGetInfo (&mfree, &mtotal);
#if 0
	printf("Total memory %zu & free memory %zu\n", mtotal, mfree);
#endif
	return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu ();


}

/* Release everything allocated by zCopyLUToGPU3D: device-side L/U index and
 * value vectors, Schur-complement work buffers, pinned host staging buffers,
 * and the per-supernode event arrays.  Only stream 0's pinned buffers are
 * freed here (they are only allocated for stream 0 host-side).
 * Returns 0. */
int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
{
	/* Free the L data structure on GPU */
	checkGPU(gpuFree(A_gpu->LrowindVec));
	checkGPU(gpuFree(A_gpu->LrowindPtr));

	checkGPU(gpuFree(A_gpu->LnzvalVec));
	checkGPU(gpuFree(A_gpu->LnzvalPtr));
	free(A_gpu->LnzvalPtr_host);

	/*freeing the pinned memory*/
	int_t streamId = 0;
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host));

	checkGPU(gpuFreeHost(A_gpu->acc_L_buff));
	checkGPU(gpuFreeHost(A_gpu->acc_U_buff));
	checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf));
	checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf));


	/* NOTE(review): the event handles inside these arrays are not destroyed
	   with gpuEventDestroy before the arrays are freed — confirm whether the
	   events are reclaimed elsewhere. */
	SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE
	SUPERLU_FREE(A_gpu->GemmStart);
	SUPERLU_FREE(A_gpu->GemmEnd);
	SUPERLU_FREE(A_gpu->ScatterEnd);
	SUPERLU_FREE(A_gpu->ePCIeH2D);
	SUPERLU_FREE(A_gpu->ePCIeD2H_Start);
	SUPERLU_FREE(A_gpu->ePCIeD2H_End);

	/* Free the U data structure on GPU */
	checkGPU(gpuFree(A_gpu->UrowindVec));
	checkGPU(gpuFree(A_gpu->UrowindPtr));

	//free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated

	checkGPU(gpuFree(A_gpu->UnzvalVec));
	checkGPU(gpuFree(A_gpu->UnzvalPtr));

	checkGPU(gpuFree(A_gpu->grid));

	/* Free the Schur complement structure on GPU */
	checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU));

	checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info));

	// checkGPU(gpuFree(A_gpu->indirect));
	// checkGPU(gpuFree(A_gpu->indirect2));
	checkGPU(gpuFree(A_gpu->xsup));

	checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].usub));

	/* lookup tables built in zCopyLUToGPU3D */
	checkGPU(gpuFree(A_gpu->local_l_blk_infoVec));
	checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr));
	checkGPU(gpuFree(A_gpu->jib_lookupVec));
	checkGPU(gpuFree(A_gpu->jib_lookupPtr));
	checkGPU(gpuFree(A_gpu->local_u_blk_infoVec));
	checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr));
	checkGPU(gpuFree(A_gpu->ijb_lookupVec));
	checkGPU(gpuFree(A_gpu->ijb_lookupPtr));

	return 0;
}



/* Debug helper: copy an m-by-n device matrix dA (leading dimension lda)
 * back to the host and print it entry by entry. */
void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex * dA, int_t lda )
{
	doublecomplex *cPtr = (doublecomplex *) malloc(sizeof(doublecomplex) * lda * n);
	checkGPU(gpuMemcpy( cPtr, dA,
	                    lda * n * sizeof(doublecomplex),
	                    gpuMemcpyDeviceToHost)) ;

	int_t i, j;
	printf( "\n %s\n", desc );
	for ( i = 0; i < m; i++ )
	{
		for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] );
		printf( "\n" );
	}
	free(cPtr);
}

/* Summarize per-supernode GPU timing: GEMM time, scatter time, and PCIe
 * transfer times/bandwidths, derived from the event pairs recorded during
 * factorization.  Times come from gpuEventElapsedTime (milliseconds),
 * converted to seconds before accumulation. */
void zprintGPUStats(zLUstruct_gpu_t * A_gpu)
{
	double tGemm = 0;
	double tScatter = 0;
	double tPCIeH2D = 0;
	double tPCIeD2H = 0;

	for (int_t i = 0; i < A_gpu->nsupers; ++i)
	{
		float milliseconds = 0;

		if (A_gpu->isOffloaded[i])
		{
			/* H2D window: from the H2D copy event to the start of GEMM */
			gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]);
			tPCIeH2D += 1e-3 * (double) milliseconds;
			milliseconds = 0;
			gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]);
			tGemm += 1e-3 * (double) milliseconds;
			milliseconds = 0;
			gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]);
			tScatter += 1e-3 * (double) milliseconds;
		}

		milliseconds = 0;
		gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]);
		tPCIeD2H += 1e-3 * (double) milliseconds;
	}

	printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n",
	       A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm );
	printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n",
	       A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter );
	printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
	       1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D );
	printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
	       1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H );
	fflush(stdout);

} /* end printGPUStats */

/* Initialize the GPU side of the data structure.
 */
int zinitSluGPU3D_t(
    zsluGPU_t *sluGPU, // LU structures on GPU, see zlustruct_gpu.h
    zLUstruct_t *LUstruct,
    gridinfo3d_t * grid3d,
    int_t* perm_c_supno,
    int_t n,
    int_t buffer_size,    /* read from env variable MAX_BUFFER_SIZE */
    int_t bigu_size,
    int_t ldt             /* NSUP read from sp_ienv(3) */
)
{
    /* Reset the device, create the streams/handles, then hand off to
       zCopyLUToGPU3D to allocate and populate the device LU structure. */
    checkGPUErrors(gpuDeviceReset ()) ;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    zLocalLU_t *Llu = LUstruct->Llu;
    int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;

    sluGPU->nGPUStreams = getnGPUStreams();

    /* Scatter block size defaults to ldt; the env var may only raise it. */
    int SCATTER_THREAD_BLOCK_SIZE = ldt;
    if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
    {
	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
	if(stbs>=ldt)
	{
	    SCATTER_THREAD_BLOCK_SIZE = stbs;
	}

    }

    if (grid3d->iam == 0)
    {
	printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams);
	fflush(stdout);
	printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE);

	if ( MAX_SUPER_SIZE < ldt )
	{
	    ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
	}
    }

    gpuStreamCreate(&(sluGPU->CopyStream));

    for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++)
    {
	gpuStreamCreate(&(sluGPU->funCallStreams[streamId]));
	gpublasCreate(&(sluGPU->gpublasHandles[streamId]));
	sluGPU->lastOffloadStream[streamId] = -1; /* no offload issued yet */
    }

    sluGPU->A_gpu = (zLUstruct_gpu_t *) malloc (sizeof(zLUstruct_gpu_t));
    sluGPU->A_gpu->perm_c_supno = perm_c_supno;

    /* Allocate GPU memory for the LU data structures, and copy
       the host LU structure to GPU side.
     */
    zCopyLUToGPU3D ( isNodeInMyGrid,
	             Llu,             /* referred to as A_host */
	             sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt
    );

    return 0;
} /* end zinitSluGPU3D_t */


/* Prepare the device-to-host reduction descriptor d2Hred for supernode
 * next_k: decide whether this rank's L and/or U panel of next_k is dirty on
 * the GPU (and must be copied back), record the local block numbers, index
 * vectors and copy lengths.  Waits for any still-running offload stream
 * that last touched the dirty panel.  Returns 0. */
int zinitD2Hreduce(
    int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP,
    zsluGPU_t *sluGPU, gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t* SCT
)
{
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    zLocalLU_t *Llu = LUstruct->Llu;
    int_t* xsup = Glu_persist->xsup;
    int_t iam = grid->iam;
    int_t myrow = MYROW (iam, grid);
    int_t mycol = MYCOL (iam, grid);
    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;


    // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1);
    // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/
    int_t mkcol, mkrow;

    int_t kljb = LBj( next_k, grid ); /*local block number for next block*/
    int_t kijb = LBi( next_k, grid ); /*local block number for next block*/

    int_t *kindexL ; /*for storing index vectors*/
    int_t *kindexU ;
    mkrow = PROW (next_k, grid);
    mkcol = PCOL (next_k, grid);
    int_t ksup_size = SuperSize(next_k);

    int_t copyL_kljb = 0;
    int_t copyU_kljb = 0;
    int_t l_copy_len = 0;
    int_t u_copy_len = 0;

    /* L panel: copy back only if I own the column, it is nonempty, and the
       GPU copy was updated (dirty bit holds the k0 of the last offload). */
    if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag)
    {
	if (HyP->Lblock_dirty_bit[kljb] > -1)
	{
	    copyL_kljb = 1;
	    int_t lastk0 = HyP->Lblock_dirty_bit[kljb];
	    int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams;
	    if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
	    {
		// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
		double ttx = SuperLU_timer_();
		gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
		SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
		sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
	    }
	}

	kindexL = Lrowind_bc_ptr[kljb];
	l_copy_len = kindexL[1] * ksup_size;
    }

    /* U panel: same dirty-bit / stream-wait protocol as for L. */
    if ( mkrow == myrow &&
         Ufstnz_br_ptr[kijb] != NULL && last_flag )
    {
	if (HyP->Ublock_dirty_bit[kijb] > -1)
	{
	    copyU_kljb = 1;
	    int_t lastk0 = HyP->Ublock_dirty_bit[kijb];
	    int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams;
	    if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
	    {
		// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
		double ttx = SuperLU_timer_();
		gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
		SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
		sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
	    }
	}
	// copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0;
	kindexU = Ufstnz_br_ptr[kijb];
	u_copy_len = kindexU[1];
    }

    // wait for streams if they have not been finished

    /* Fill in the reduction descriptor for the caller. */
    // d2Hred->next_col = next_col;
    d2Hred->next_k = next_k;
    d2Hred->kljb = kljb;
    d2Hred->kijb = kijb;
    d2Hred->copyL_kljb = copyL_kljb;
    d2Hred->copyU_kljb = copyU_kljb;
    d2Hred->l_copy_len = l_copy_len;
    d2Hred->u_copy_len = u_copy_len;
    d2Hred->kindexU = kindexU;
    d2Hred->kindexL = kindexL;
    d2Hred->mkrow = mkrow;
    d2Hred->mkcol = mkcol;
    d2Hred->ksup_size = ksup_size;
    return 0;
} /* zinitD2Hreduce */

/* Accumulate (axpy with alpha = 1) the L/U panel data brought back from the
 * GPU (A_gpu->acc_L_buff / acc_U_buff, filled by zsendLUpanelGPU2HOST on
 * CopyStream) into the host-side Lnzval/Unzval arrays.  Synchronizes
 * CopyStream first so the D2H copies are complete.  Returns 0. */
int zreduceGPUlu(
    int last_flag,
    d2Hreduce_t* d2Hred,
    zsluGPU_t *sluGPU,
    SCT_t *SCT,
    gridinfo_t *grid,
    zLUstruct_t *LUstruct
)
{
    zLocalLU_t *Llu = LUstruct->Llu;
    int iam = grid->iam;
    int_t myrow = MYROW (iam, grid);
    int_t mycol = MYCOL (iam, grid);
    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;

    gpuStream_t CopyStream;
    zLUstruct_gpu_t *A_gpu;
    A_gpu = sluGPU->A_gpu;
    CopyStream = sluGPU->CopyStream;

    int_t kljb = d2Hred->kljb;
    int_t kijb = d2Hred->kijb;
    int_t copyL_kljb = d2Hred->copyL_kljb;
    int_t copyU_kljb = d2Hred->copyU_kljb;
    int_t mkrow = d2Hred->mkrow;
    int_t mkcol = d2Hred->mkcol;
    int_t ksup_size = d2Hred->ksup_size;
    int_t *kindex;
    /* Wait for the asynchronous D2H copies issued on CopyStream. */
    if ((copyL_kljb || copyU_kljb) && last_flag )
    {
	double ttx = SuperLU_timer_();
	gpuStreamSynchronize(CopyStream);
	SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx;
    }

    double tt_start = SuperLU_timer_();

    if (last_flag) {
	if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL )
	{
	    kindex = Lrowind_bc_ptr[kljb];
	    int_t len = kindex[1];

	    if (copyL_kljb)
	    {
		doublecomplex *nzval_host;
		nzval_host = Lnzval_bc_ptr[kljb];
		int_t llen = ksup_size * len;
		doublecomplex alpha = {1.0, 0.0};
		superlu_zaxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1);
	    }

	}
    }
    if (last_flag) {
	if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL )
	{
	    kindex = Ufstnz_br_ptr[kijb];
	    int_t len = kindex[1];

	    if (copyU_kljb)
	    {
		doublecomplex *nzval_host;
		nzval_host = Unzval_br_ptr[kijb];

		doublecomplex alpha = {1.0, 0.0};
		superlu_zaxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1);
	    }
	}
    }

    double tt_end = SuperLU_timer_();
    SCT->AssemblyTimer += tt_end - tt_start;
    return 0;
} /* zreduceGPUlu */


/* Block until the given offload stream has drained; the wait is charged to
 * SCT->PhiWaitTimer.  Returns 0. */
int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT)
{
    double ttx = SuperLU_timer_();
    gpuStreamSynchronize(sluGPU->funCallStreams[streamId]);
    SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
    return 0;
}

/* Issue asynchronous device-to-host copies (on CopyStream) of the L and/or
 * U panel values selected by d2Hred into the pinned acc_L_buff/acc_U_buff,
 * bracketed by the ePCIeD2H_Start/End events for supernode k0.
 * The copies are NOT synchronized here; zreduceGPUlu waits for them. */
int zsendLUpanelGPU2HOST(
    int_t k0,
    d2Hreduce_t* d2Hred,
    zsluGPU_t *sluGPU
)
{
    int_t kljb = d2Hred->kljb;
    int_t kijb = d2Hred->kijb;
    int_t copyL_kljb = d2Hred->copyL_kljb;
    int_t copyU_kljb = d2Hred->copyU_kljb;
    int_t l_copy_len = d2Hred->l_copy_len;
    int_t u_copy_len = d2Hred->u_copy_len;
    gpuStream_t CopyStream = sluGPU->CopyStream;;
    zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu;
    double tty = SuperLU_timer_();
    gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream);
    if (copyL_kljb)
	checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]],
	                        l_copy_len * sizeof(doublecomplex), gpuMemcpyDeviceToHost, CopyStream ) );

    if (copyU_kljb)
	checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]],
	                        u_copy_len * sizeof(doublecomplex), gpuMemcpyDeviceToHost, CopyStream ) );
    gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream);
    /* tHost_PCIeD2H measures only the (async) issue time on the CPU side;
       the actual transfer time comes from the event pair above. */
    A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty;
    A_gpu->cPCIeD2H += u_copy_len * sizeof(doublecomplex) + l_copy_len * sizeof(doublecomplex);

    return 0;
}

/* Copy L and U panel data structures from host to the host part of the
   data structures in A_gpu.
   GPU is not involved in this routine. */
int zsendSCUdataHost2GPU(
    int_t streamId,
    int_t* lsub,
    int_t* usub,
    doublecomplex* bigU,
    int_t bigu_send_size,
    int_t Remain_lbuf_send_size,
    zsluGPU_t *sluGPU,
    HyP_t* HyP
)
{
    //{printf("....[enter] zsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}

    /* lengths are decoded from the panel index headers */
    int_t usub_len = usub[2];
    int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
    //{printf("....[2] in zsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);}
    zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu;
    /* Stage everything into the pinned buffers of this stream; the actual
       H2D transfers are issued later (in zSchurCompUpdate_GPU). */
    memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len);
    memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len);
    memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info,
	   sizeof(Remain_info_t)*HyP->RemainBlk);
    memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi,
	   sizeof(Ublock_info_t)*HyP->num_u_blks_Phi);
    memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff,
	   sizeof(doublecomplex)*Remain_lbuf_send_size);
    memcpy(A_gpu->scubufs[streamId].bigU_host, bigU,
	   sizeof(doublecomplex)*bigu_send_size);

    return 0;
}

/* Sherry: not used ?*/
#if 0
int freeSluGPU(zsluGPU_t *sluGPU)
{
    return 0;
}
#endif

/* Allocate GPU memory for the LU data structures, and copy
   the host LU structure to GPU side.
   After factorization, the GPU LU structure should be freed by
   calling zfree_LUsstruct_gpu.
*/ +void zCopyLUToGPU3D ( + int* isNodeInMyGrid, + zLocalLU_t *A_host, /* distributed LU structure on host */ + zsluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU_host = (doublecomplex *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (doublecomplex *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU = (doublecomplex *) tmp_ptr; + gpu_mem_used += bigu_size * 
sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(doublecomplex))) ; + A_gpu->scubufs[streamId].bigV = (doublecomplex *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + 
checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, 
cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; 
+ local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... */ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + 
u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(doublecomplex) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(doublecomplex) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(doublecomplex) * 1e-9, + u_val_len * sizeof(doublecomplex) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(doublecomplex))); + A_gpu->LnzvalVec = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(doublecomplex))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + 
checkGPUErrors(gpuMalloc( &tmp_ptr, u_val_len * sizeof(doublecomplex))); + A_gpu->UnzvalVec = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(doublecomplex))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(zLUstruct_gpu_t))) ; + *dA_gpu = (zLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(zLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end zCopyLUToGPU3D */ + + + +int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t 
streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + zinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + zreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + zreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* zreduceAllAncestors3d_GPU */ + + +void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/zsuperlu_gpu.hip.cpp b/SRC/zsuperlu_gpu.hip.cpp new file mode 100644 index 00000000..0d7a6dee --- /dev/null +++ b/SRC/zsuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "zsuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c new file mode 100644 index 00000000..517a176e --- /dev/null +++ b/SRC/ztreeFactorization.c @@ -0,0 +1,762 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_zdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t zLluBufInit(zLUValSubBuf_t* LUvsb, zLUstruct_t *LUstruct)
+{
+    /* Allocate the four L/U panel exchange buffers, sized by the
+     * precomputed maxima in Llu->bufmax[0..3]. */
+    int_t *bufmax = LUstruct->Llu->bufmax;
+    LUvsb->Lsub_buf = intMalloc_dist(bufmax[0]);           /* L index buffer */
+    LUvsb->Lval_buf = doublecomplexMalloc_dist(bufmax[1]); /* L value buffer */
+    LUvsb->Usub_buf = intMalloc_dist(bufmax[2]);           /* U index buffer */
+    LUvsb->Uval_buf = doublecomplexMalloc_dist(bufmax[3]); /* U value buffer */
+    return 0;
+}
+
+zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    /* Allocate one diagonal-factorization buffer per leaf node.
+     * Initialize to NULL so a well-defined pointer is returned even when
+     * mxLeafNode == 0 (previously the uninitialized pointer was returned,
+     * which is undefined behavior). */
+    zdiagFactBufs_t** dFBufs = NULL;
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (zdiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(zdiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* Each entry gets its own ldt x ldt L and U scratch blocks. */
+        dFBufs[i] = (zdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(zdiagFactBufs_t));
+        assert(dFBufs[i]);
+        zinitDiagFactBufs(ldt, dFBufs[i]);
+
+    }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/
+
+    return dFBufs;
+}
+
+// sherry added
+int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs)
+{
+    /* Release every per-leaf buffer pair, then the array itself. */
+    for (int leaf = 0; leaf < mxLeafNode; ++leaf) {
+	zdiagFactBufs_t *buf = dFBufs[leaf];
+	SUPERLU_FREE(buf->BlockUFactor);
+	SUPERLU_FREE(buf->BlockLFactor);
+	SUPERLU_FREE(buf);
+    }
+
+    /* mxLeafNode can be 0 for the replicated layers of the processes;
+     * in that case the array itself was never allocated. */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs);
+
+    return 0;
+}
+
+zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct)
+{
+    /* One LU panel exchange buffer per look-ahead slot. */
+    zLUValSubBuf_t** LUvsbs =
+        (zLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(zLUValSubBuf_t*));
+    for (int_t slot = 0; slot < numLA; ++slot)
+    {
+        LUvsbs[slot] = (zLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
+        zLluBufInit(LUvsbs[slot], LUstruct);
+    }
+    return LUvsbs;
+}
+
+// sherry added
+int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs)
+{
+    /* Free the four panel buffers of each look-ahead slot, then the slot
+     * structs and the array (inverse of zLluBufInitArr). */
+    for (int_t slot = 0; slot < numLA; ++slot) {
+	zLUValSubBuf_t *vsb = LUvsbs[slot];
+	SUPERLU_FREE(vsb->Lsub_buf);
+	SUPERLU_FREE(vsb->Lval_buf);
+	SUPERLU_FREE(vsb->Usub_buf);
+	SUPERLU_FREE(vsb->Uval_buf);
+	SUPERLU_FREE(vsb);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                  zscuBufs_t* scuBufs,
+                  zLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    /* Workspace for the Schur-complement update: bigU gathers the U panel,
+     * bigV holds per-thread GEMM output (ldt rows, num_threads slabs). */
+    scuBufs->bigU = zgetBigU(nsupers, grid, LUstruct);
+    scuBufs->bigV = zgetBigV(ldt, num_threads);
+    return 0;
+}
+
+// sherry added
+int zfreeScuBufs(zscuBufs_t* scuBufs)
+{
+    /* Release the Schur-complement workspace obtained in zinitScuBufs. */
+    SUPERLU_FREE(scuBufs->bigU);
+    SUPERLU_FREE(scuBufs->bigV);
+    return 0;
+}
+
+int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf)
+{
+    /* ldt-by-ldt scratch blocks for the diagonal U and L factors. */
+    const int_t nelts = ldt * ldt;
+    dFBuf->BlockUFactor = doublecomplexMalloc_dist(nelts);
+    dFBuf->BlockLFactor = doublecomplexMalloc_dist(nelts);
+    return 0;
+}
+
+/*! \brief Dense 2D factorization of the supernodes of one subtree.
+ *
+ * For each supernode k (taken in perm_c_supno order): factor and broadcast
+ * the diagonal block, update and broadcast the L and U panels, wait for the
+ * panels, then apply the Schur-complement update via zblock_gemm_scatter.
+ * Non-blocking MPI requests live in comReqs, message counts in msgs, and
+ * the received panels in LUvsb.  Timing is accumulated in SCT.
+ * Always returns 0; factorization errors are reported through *info by
+ * zDiagFactIBCast (NOTE(review): *info semantics inferred from the callee's
+ * argument list — confirm).
+ */
+int_t zdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    zscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    zLUValSubBuf_t* LUvsb,
+    zdiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	zDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	zIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	zUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	zIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	/* Block until both panels of supernode k have arrived. */
+	zWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	zWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT);
+#endif
+        double tsch = SuperLU_timer_();
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= zSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            doublecomplex* bigV = scuBufs->bigV;
+            doublecomplex* bigU = scuBufs->bigU;
+
+            /* One GEMM-scatter task per (L-block, U-block) pair:
+               ub = ij / nlb is the U (column) block and lb = ij % nlb the
+               L (row) block of the tile updated by this iteration. */
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij)
+            {
+                /* code */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                doublecomplex** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                doublecomplex *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                doublecomplex *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                zblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+        /* Finish waiting for this supernode's diagonal-block sends before
+           their request slots are reused in the next iteration. */
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT);
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* zdenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ */
+int_t zsparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    zscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    zLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    zdiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zsparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = zSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            doublecomplex* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    zblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    zblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    zblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    zblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    zLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    zUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if a U_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit zsparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* zsparseTreeFactor_ASYNC */
diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c
new file mode 100644
index 00000000..77044ba3
--- /dev/null
+++ b/SRC/ztreeFactorizationGPU.c
@@ -0,0 +1,758 @@
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "zlustruct_gpu.h" + +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, 
+ int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + zsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = zSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + doublecomplex *bigV = scuBufs->bigV; + doublecomplex *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < 
HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + zreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + zwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + zsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + zSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + zUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end zsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c new file mode 100644 index 00000000..184d0e23 --- /dev/null +++ b/SRC/ztrfAux.c @@ -0,0 +1,756 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
+#include "superlu_zdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure HyP that assists HALO offload of the Schur complement. */
+void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;  /* -1 => no panel has been offloaded to the GPU yet */
+#if 0
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));  /* mrb entries -- presumably max # of block rows; confirm at call site */
+    HyP->lookAhead_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));  /* sized like the L-panel value receive buffer (Llu->bufmax[1]) */
+    HyP->Remain_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));  /* same capacity as lookAhead_L_buff */
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));  /* U-block metadata for the GPU ("Phi") partition */
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));  /* U-block metadata for the host partition */
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);  /* one flag per block column */
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);  /* one flag per block row */
+#endif
+
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;  /* -1 => no offload recorded for this L block yet */
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;  /* -1 => no offload recorded for this U block yet */
+    }
+
+    HyP->last_offload = -1;  /* NOTE(review): redundant -- already set at function entry */
+    HyP->superlu_acc_offload = get_acc_offload ();  /* presumably reads the GPU-offload enable switch; confirm */
+
+    HyP->nGPUStreams =0;  /* filled in later by the GPU setup code -- TODO confirm */
+} /* zInit_HyP */
+
+/* Initialize the 3D LU structure from per-level sForests (forest-interface wrapper around zinit3DLUstruct). */
+void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, zLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;  /* levels in the z-dimension process tree -- assumes Np is a power of 2; confirm */
+    int_t numForests = (1 << maxLvl) - 1;  /* complete binary tree with maxLvl levels has 2^maxLvl - 1 nodes */
+    int_t* gNodeCount = INT_T_ALLOC (numForests);
+    int_t** gNodeLists =  (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*));
+
+    for (int i = 0; i < numForests; ++i)
+	{
+	    gNodeCount[i] = 0;
+	    gNodeLists[i] = NULL;
+	    /* flatten each non-empty forest into (count, node-list) form */
+	    if (sForests[i])
+		{	
+                    gNodeCount[i] = sForests[i]->nNodes;
+		    gNodeLists[i] = sForests[i]->nodeList;  /* aliases the forest's list; not a copy */
+		}
+	}
+    
+    /* delegate to the original (count, node-list) interface */
+    zinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     gNodeCount, gNodeLists, LUstruct, grid3d);
+
+    SUPERLU_FREE(gNodeCount);  // sherry added
+    SUPERLU_FREE(gNodeLists);  /* frees only the pointer array; node lists remain owned by sForests */
+}
+
+int_t zSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    doublecomplex *bigU,
+    int_t* Lsub_buf,
+    doublecomplex *Lval_buf,
+    int_t* Usub_buf,
+    doublecomplex *Uval_buf,
+    gridinfo_t *grid,
+    zLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    doublecomplex* uval;
+    int_t* lsub;
+    doublecomplex* lusup;
+
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        int_t ldu   = 0;
+        int_t full  = 1;
+        int_t num_u_blks = 0;
+
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        zgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* zSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+/* Gather the L(:,k) and U(k,:) panels of supernode k into HyP's
+ * look-ahead / remaining partitions, preparing the split CPU/GPU
+ * Schur-complement update.  Returns nonzero iff both panels are
+ * non-empty (msgcnt[0] && msgcnt[2]); 0 means nothing to do. */
+int_t zSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    zscuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, zLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    /* Unpack the communication/work buffers and LU metadata. */
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    doublecomplex * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    doublecomplex * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    doublecomplex* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    doublecomplex* uval;
+    int_t* lsub;
+    doublecomplex* lusup;
+
+    /* Reset the HyP counters for this supernode. */
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    /* Select panel sources: local storage if this process owns column/row
+       k of the grid, otherwise the receive buffers filled by the Ibcast. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval remain
+       uninitialized; presumably msgcnt guarantees they are unused in that
+       case -- confirm against the d-precision version. */
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        /* Gather L into look-ahead/remaining row sets, then gather U
+           (only needed when some L rows were found). */
+        zRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            zRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* zSchurComplementSetupGPU */
+
+
+/* Allocate the shared GEMM scratch buffer bigV.
+ * ldt:         maximum supernode dimension (each thread's slice is
+ *              8 * ldt * ldt entries -- the factor 8 mirrors the sizing
+ *              used in zgetBigU below);
+ * num_threads: number of threads that will carve slices out of it.
+ * Aborts on allocation failure; never returns NULL. */
+doublecomplex* zgetBigV(int_t ldt, int_t num_threads)
+{
+    doublecomplex *bigV;
+    if (!(bigV = doublecomplexMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for zgemm buffV");  /* was "dgemm": d-precision copy/paste */
+    return bigV;
+}
+
+/* Allocate the bigU buffer that receives the gathered U(k,:) panel.
+ * nsupers:  number of supernodes; grid: 2D process grid;
+ * LUstruct: distributed LU data, scanned for the widest local L panel.
+ * The size is 8 * sp_ienv_dist(3) (max panel width) * max_row_size *
+ * max(Pr/Pc, 1), where max_row_size is reduced over this process row --
+ * so this is collective over grid->rscp.comm.
+ * Aborts on allocation failure (consistent with zgetBigV). */
+doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {   /* I own block column i; lsub[1] is its row dimension. */
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction within my process row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    /*Buffer size is max of of look ahead window*/
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    doublecomplex* bigU = doublecomplexMalloc_dist(bigu_size);
+    if ( !bigU )  /* was unchecked; zgetBigV aborts on failure too */
+        ABORT ("Malloc failed for zgetBigU buffer");
+
+    return bigU;
+} /* zgetBigU */
+
+
+/* Build the 3D factorization partition for nsupers supernodes: the
+ * supernodal elimination forest, per-level forests, this process's tree
+ * indices/permutations, and the LU communication buffers.  The returned
+ * structure owns its arrays and is released by zDestroy_trf3Dpartition.
+ * NOTE(review): the SUPERLU_MALLOC results below are not checked for NULL. */
+trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      zLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter zinitTrf3Dpartition()");
+#endif
+    /* Supernode elimination order and its inverse. */
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    /* Per-supernode count of children not yet factored. */
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    zinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    /* Communication buffers for exchanging L/U panels. */
+    zLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
+    zLluBufInit(LUvsb, LUstruct);
+
+    /* Map every supernode to the forest (tree) that owns it. */
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* code */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* code */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit zinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* zinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()");
+#endif
+    /* Release the flat arrays owned by the partition structure. */
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    /* Free each forest's node list and topology arrays, then the forest
+       itself; entries may be NULL for levels this grid does not own. */
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    /* The LUvsb communication buffers, then their container. */
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit zDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+/* NOTE(review): dead code kept for reference only.  If ever re-enabled:
+   the call below passes 6 arguments to the 5-parameter num_full_cols_U
+   above; the printf pairs %ld with a (long long) cast; and MPI_Allreduce
+   sends an int with mpi_int_t.  The live versions in util.c supersede. */
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* code */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /*initilize perm_u*/
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
new file mode 100644
index 00000000..3dc098f1
--- /dev/null
+++ b/SRC/ztrfCommWrapper.c
@@ -0,0 +1,547 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+/* Factor the diagonal block A(k,k) on its owner process pkk and post
+ * non-blocking broadcasts of the factored diagonal block along the
+ * process row (U part) and process column (L part).  Non-owners post
+ * the matching Irecvs into BlockUFactor / BlockLFactor.  Returns 0. */
+int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     doublecomplex *BlockUFactor,
+                     doublecomplex *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     zLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    //xsup for supersize
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        zIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        zIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /*factorize the leaf node and broadcast them
+     process row and process column*/
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Zgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is over writen here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        zPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /* NOTE(review): BlockLFactor is passed to both sends below; after
+           Local_Zgstrf2 + zPackLBlock the packed diagonal block appears to
+           carry both triangular factors -- confirm vs. d-precision code. */
+        /*Isend U blocks to the process row*/
+        int_t nsupc = SuperSize(k);
+        zISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        zISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+/* Triangular solve of the L panel of supernode k against the
+ * upper-triangular diagonal factor (X * U = A: side=R, uplo=U), in
+ * strips of BL rows.  Non-owners of the diagonal solve all nsupr rows
+ * using the received BlockUFactor; the owner pkk skips the nsupc
+ * diagonal rows of its local panel.
+ * NOTE(review): the omp tasks assume an enclosing parallel region. */
+int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
+		      doublecomplex* BlockUFactor,
+		      gridinfo_t *grid,
+		      zLUstruct_t *LUstruct)
+{
+    doublecomplex alpha = {1.0, 0.0};
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /*factor the L panel*/
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        /* l rows of the panel, solved BL rows at a time as omp tasks. */
+        int_t l = nsupr;
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        /* Owner skips the nsupc rows of the diagonal block L(k,k). */
+        int_t l = nsupr - nsupc;
+
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* zLPanelTrSolve */
+
+int_t zLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    doublecomplex* BlockUFactor,
+                    gridinfo_t *grid,
+                    zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    /* Block until the U diagonal block of supernode k has been received,
+       then triangular-solve the local L panel against it. */
+    zUDiagBlockRecvWait(k, IrecvPlcd_D, factored_L, U_diag_blk_recv_req,
+                        grid, LUstruct, SCT);
+    zLPanelTrSolve(k, factored_L, BlockUFactor, grid, LUstruct);
+    return 0;
+}  /* zLPanelUpdate */
+
+#define BL  32
+
+/* Triangular solve of the U panel of supernode k against the lower
+ * factor: non-owners of the diagonal use the received BlockLFactor
+ * (leading dimension nsupc); the owner pkk uses its local L panel
+ * (leading dimension nsupr).  Per-block scratch comes from bigV, one
+ * ldt*ldt slice per OpenMP thread.  Returns 0. */
+int_t zUPanelTrSolve( int_t k,  
+                     doublecomplex* BlockLFactor,
+                     doublecomplex* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     zLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /*factor the U panel*/
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        doublecomplex *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        doublecomplex *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
+                zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /*factor the U panel*/
+    if (iam == pkk)
+    {
+        /* code */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        doublecomplex *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            doublecomplex *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            doublecomplex *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
+                    zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* zUPanelTrSolve */
+
+int_t zUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    doublecomplex* BlockLFactor,
+                    doublecomplex* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    zLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+    /* Wait until the L diagonal block of supernode k has arrived, then
+       triangular-solve the local U panel against it. */
+    LDiagBlockRecvWait(k, factored_U, L_diag_blk_recv_req, grid);
+    zUPanelTrSolve(k, BlockLFactor, bigV, ldt, Ublock_info,
+                   grid, LUstruct, stat, SCT);
+    return 0;
+}  /* zUPanelUpdate */
+
+/* Post the non-blocking broadcast of the factored L panel of supernode k
+ * (owner column) or the matching Irecv on other columns; accumulates the
+ * communication volume in SCT and clears factored[k].  Returns 0. */
+int_t zIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    doublecomplex* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    zLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* code */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        doublecomplex* lusup = Lnzval_bc_ptr[lk];
+
+        zIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            /* NOTE(review): SuperSize(lk) uses the *local* block number;
+               SuperSize(k) (global) looks intended -- affects only the
+               commVolFactor statistic, not the numerics. */
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(doublecomplex));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*place Irecv*/
+            zIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+/* Post the non-blocking broadcast of the factored U panel of supernode k
+ * (owner row) or the matching Irecv on other rows; accumulates the
+ * communication volume in SCT, or zeroes msgcnt[2] when no panel will
+ * arrive.  Returns 0. */
+int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, doublecomplex* Uval_buf,
+    			     gridinfo_t *grid, zLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /*send U panel to myrow*/
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        doublecomplex* uval = Unzval_br_ptr[lk];
+        zIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* usub[1]/usub[2] hold the nzval and index lengths. */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(doublecomplex));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            zIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t zWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*force wait for I recv to complete*/
+            zWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t zWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            zWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
index 6ed2eb97..6688710a 100644
--- a/SRC/zutil_dist.c
+++ b/SRC/zutil_dist.c
@@ -13,10 +13,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include 
@@ -393,6 +393,7 @@ void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, doublecomp
 
     return;
 }
+/**** end utilities added for SUNDIALS ****/
 
 /*! \brief Allocate storage in ScalePermstruct */
 void zScalePermstructInit(const int_t m, const int_t n,
@@ -421,9 +422,65 @@ void zScalePermstructFree(zScalePermstruct_t *ScalePermstruct)
         SUPERLU_FREE(ScalePermstruct->R);
         SUPERLU_FREE(ScalePermstruct->C);
         break;
+      default: break;
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int zDeAllocGlu_3d(zLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* zDeAllocLlu_3d */
+
 
 /**** Other utilities ****/
 void
@@ -432,9 +489,14 @@ zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx)
     int  i, j;
     for (j = 0; j < nrhs; ++j)
 	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx].r = 1.0;
-	    else x[i + j*ldx].r = 2.0;
-	    x[i + j*ldx].i = 0.0;
+	    if ( i % 2 ) {
+	        x[i + j*ldx].r = 1.0 + (double)(i+1.)/n;
+		x[i + j*ldx].i = 1.0;
+	    }
+	    else {
+	        x[i + j*ldx].r = 2.0 + (double)(i+1.)/n;
+	        x[i + j*ldx].i = 2.0;
+            }
 	}
 }
 
@@ -557,7 +619,7 @@ void zPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 
 /*! \brief Sets all entries of matrix L to zero.
  */
-void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+void zZeroLblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct)
 {
     doublecomplex zero = {0.0, 0.0};
     register int extra, gb, j, lb, nsupc, nsupr, ncb;
@@ -587,7 +649,7 @@ void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
             }
 	}
     }
-} /* zZeroLblocks */
+} /* end zZeroLblocks */
 
 
 /*! \brief Dump the factored matrix L using matlab triple-let format
@@ -596,8 +658,8 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, zLocalLU_t *Llu)
 {
     register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-	int_t nnzL, n,nmax;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
     int_t *xsup = Glu_persist->xsup;
     int_t *index;
     doublecomplex *nzval;
@@ -650,7 +712,7 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		}
 
 	if(grid->iam==0){
-		fprintf(fp, "%d %d %d\n", n,n,nnzL);
+		fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
 	}
 
      ncb = nsupers / grid->npcol;
@@ -686,7 +748,6 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 } /* zDumpLblocks */
 
 
-
 /*! \brief Print the blocks in the factored matrix U.
  */
 void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
@@ -726,7 +787,37 @@ void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
 	}
     }
-} /* ZPRINTUBLOCKS */
+} /* end zPrintUblocks */
+
+/*! \brief Sets all entries of matrix U to zero.
+ */
+void zZeroUblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    doublecomplex zero = {0.0, 0.0};
+    register int i, extra, lb, len, nrb;
+    register int myrow, r;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    doublecomplex *nzval;
+    int nsupers = Glu_persist->supno[n-1] + 1;
+
+    nrb = nsupers / grid->nprow;
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb;
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    len = index[1];  // number of entries in nzval[];
+	    for (i = 0; i < len; ++i) {
+	        nzval[i] = zero;
+	    }
+	}
+    }
+} /* end zZeroUblocks */
 
 int
 zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm,
diff --git a/TEST/pdtest.c b/TEST/pdtest.c
index c10fc127..fd0dcf24 100644
--- a/TEST/pdtest.c
+++ b/TEST/pdtest.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 		        PStatFree(&stat);
 #if 0
 		        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-				     nrhs, b, ldb, xtrue, ldx, &grid);
+				     nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 		        if ( info ) {
 			    printf(FMT3, "pdgssvx",info,izero,n,nrhs,imat,nfail);
@@ -375,7 +375,7 @@ int main(int argc, char *argv[])
 			    dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
 					  &result[2]);
 			    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-					 nrhs, b, ldb, xtrue, ldx, &grid);
+					 nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 
 			    /* Print information about the tests that did
diff --git a/TEST/pztest.c b/TEST/pztest.c
index 6b430749..e3f57bc5 100644
--- a/TEST/pztest.c
+++ b/TEST/pztest.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 		        PStatFree(&stat);
 #if 0
 		        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-				     nrhs, b, ldb, xtrue, ldx, &grid);
+				     nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 		        if ( info ) {
 			    printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
@@ -375,7 +375,7 @@ int main(int argc, char *argv[])
 			    dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
 					  &result[2]);
 			    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-					 nrhs, b, ldb, xtrue, ldx, &grid);
+					 nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 
 			    /* Print information about the tests that did
diff --git a/cmake/XSDKDefaults.cmake b/cmake/XSDKDefaults.cmake
index 20eea804..d454d789 100644
--- a/cmake/XSDKDefaults.cmake
+++ b/cmake/XSDKDefaults.cmake
@@ -98,9 +98,9 @@ SET(USE_XSDK_DEFAULTS  ${USE_XSDK_DEFAULTS_DEFAULT}  CACHE  BOOL
   "Use XSDK defaults and behavior.")
 PRINT_VAR(USE_XSDK_DEFAULTS)
 
-SET(XSDK_ENABLE_C  TRUE)
-SET(XSDK_ENABLE_CXX  TRUE)
-SET(XSDK_ENABLE_Fortran  TRUE)
+SET(XSDK_ENABLE_C  ON)
+SET(XSDK_ENABLE_CXX  ON)
+SET(XSDK_ENABLE_Fortran  ON)
 
 # Handle the compiler and flags for a language
 MACRO(XSDK_HANDLE_LANG_DEFAULTS  CMAKE_LANG_NAME  ENV_LANG_NAME
diff --git a/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh b/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh
new file mode 100644
index 00000000..96784830
--- /dev/null
+++ b/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh
@@ -0,0 +1,29 @@
+module unload cray-mpich/7.7.6
+module swap PrgEnv-intel PrgEnv-gnu
+export MKLROOT=/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64
+
+# module use /global/common/software/m3169/cori/modulefiles
+# module unload openmpi
+module load cmake/3.18.2
+module load cuda/10.2.89
+module load openmpi/4.0.3
+
+
+export OMP_NUM_THREADS=1
+export NUM_GPU_STREAMS=1
+# srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/g20.rua
+
+
+export NSUP=128
+export NREL=20
+# for MAT in big.rua 
+# for MAT in g4.rua 
+for MAT in s1_mat_0_126936.bin 
+# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin 
+# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
+# for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx
+do
+srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../../matrix/$MAT
+done 
+
diff --git a/example_scripts/batch_script_mpi_runit_traverse_sml.sh b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
new file mode 100644
index 00000000..6bae66bf
--- /dev/null
+++ b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+#SBATCH --qos=test
+#SBATCH -N 1
+#SBATCH -t 00:30:00
+#SBATCH -J superlu_test
+#SBATCH --mail-user=liuyangzhuan@lbl.gov
+#SBATCH --gpus=1
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt
+module load cmake
+
+#srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/big.rua
+srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../../matrix/HTS/copter2.mtx
diff --git a/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh b/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
index cced19fb..8be323e0 100644
--- a/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
+++ b/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
@@ -36,10 +36,11 @@ cmake .. \
     -DCMAKE_Fortran_COMPILER=mpif90 \
 	-DCMAKE_INSTALL_PREFIX=. \
 	-DTPL_ENABLE_CUDALIB=ON \
+	-DTPL_ENABLE_LAPACKLIB=ON \
 	-DCMAKE_BUILD_TYPE=Release \
 	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 	-DCMAKE_CXX_FLAGS="-Ofast -DRELEASE ${INC_VTUNE}" \
-    -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 ${INC_VTUNE} -DGPU_ACC -I${CUDA_ROOT}/include" \
+    -DCMAKE_C_FLAGS="-std=c11 -DGPU_SOLVE -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 ${INC_VTUNE} -DGPU_ACC -I${CUDA_ROOT}/include" \
 	-DCMAKE_CUDA_FLAGS="--disable-warnings -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DGPU_ACC -gencode arch=compute_70,code=sm_70 -I/usr/common/software/openmpi/4.0.3/gcc/8.3.0/cuda/10.2.89/include"
 make pddrive			
 #	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so"
diff --git a/example_scripts/run_cmake_build_perlmutter_gcc.sh b/example_scripts/run_cmake_build_perlmutter_gcc.sh
new file mode 100644
index 00000000..38cd2bcb
--- /dev/null
+++ b/example_scripts/run_cmake_build_perlmutter_gcc.sh
@@ -0,0 +1,31 @@
+module load PrgEnv-gnu
+module load cpe-cuda
+module load cuda
+module load cmake/git-20210830
+rm -rf build
+mkdir build
+cd build
+cmake .. \
+     -DTPL_PARMETIS_LIBRARIES=ON \
+     -DTPL_PARMETIS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/include;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/metis/include" \
+     -DTPL_PARMETIS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a" \
+     -DTPL_ENABLE_COMBBLASLIB=ON \
+     -DTPL_COMBBLAS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/include;/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/CombBLAS-20211019/Applications/BipartiteMatchings" \
+     -DTPL_COMBBLAS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/lib/libCombBLAS.a" \
+     -DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DAdd_" \
+     -DCMAKE_C_COMPILER=cc \
+     -DCMAKE_CXX_COMPILER=CC \
+     -DXSDK_ENABLE_Fortran=ON \
+     -DCMAKE_CUDA_ARCHITECTURES=80 \
+     -DCMAKE_CUDA_FLAGS="-I${MPICH_DIR}/include" \
+     -DTPL_ENABLE_CUDALIB=TRUE \
+     -DTPL_CUDA_LIBRARIES="/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcublas.so;/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcudart.so" \
+     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
+     -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+     -DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \
+     -DBUILD_SHARED_LIBS=OFF \
+     -DCMAKE_INSTALL_PREFIX=. \
+     -DMPIEXEC_NUMPROC_FLAG=-n \
+     -DMPIEXEC_EXECUTABLE=/usr/bin/srun \
+     -DMPIEXEC_MAX_NUMPROCS=16
+make pddrive
\ No newline at end of file
diff --git a/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
new file mode 100644
index 00000000..1189d97b
--- /dev/null
+++ b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Bash script to submit many files to Cori/Edison/Queue
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64 
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt 
+#module load cmake
+
+
+export PATH=/home/yl33/cmake-3.20.3/bin/:$PATH
+cmake --version
+export PARMETIS_ROOT=~/petsc_master/traverse-pgi-openmpi-199-gpucuda-branch/
+
+export CUDA_ROOT=/usr/local/cuda-11.2
+#export CUDA_PATH=${CUDA_ROOT}
+rm -rf CMakeCache.txt
+rm -rf CMakeFiles
+rm -rf CTestTestfile.cmake
+rm -rf cmake_install.cmake
+rm -rf DartConfiguration.tcl 
+
+cmake .. \
+	-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include" \
+	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_ROOT}/lib/libparmetis.a;${PARMETIS_ROOT}/lib/libmetis.a" \
+	-DBUILD_SHARED_LIBS=OFF \
+	-DCMAKE_C_COMPILER=mpicc \
+	-DTPL_ENABLE_CUDALIB=TRUE \
+	-DTPL_ENABLE_LAPACKLIB=TRUE \
+	-Denable_openmp:BOOL=TRUE \
+	-DCMAKE_CUDA_FLAGS="-ccbin pgc++ -D_PGIC_PRINCETON_OVERRIDE_" \
+	-DCMAKE_CUDA_HOST_COMPILER=mpicc \
+	-DCMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES="/usr/local/cuda-11.2/include" \
+	-DCMAKE_INCLUDE_SYSTEM_FLAG_C="-I" \
+	-DCMAKE_CXX_COMPILER=mpiCC \
+	-DCMAKE_INSTALL_PREFIX=. \
+	-DCMAKE_BUILD_TYPE=Release \
+	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+        -DTPL_BLAS_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DTPL_LAPACK_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DCMAKE_CXX_FLAGS="-DRELEASE -pgf90libs" \
+        -DCMAKE_C_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -pgf90libs"
+
+make pddrive
+make install
+
+#	-DXSDK_ENABLE_Fortran=FALSE \ 
+
+
+#	-DCUDAToolkit_LIBRARY_ROOT="${CUDA_ROOT}" \
+
+#salloc -N 1 --qos=test -t 0:30:00 --gpus=2
+
+
+
+
+#        -DCMAKE_CUDA_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DGPU_ACC -gencode arch=compute_70,code=sm_70"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so" \
+#        -DCMAKE_CXX_FLAGS="-g -trace -Ofast -std=c++11 -DAdd_ -DRELEASE -tcollect -L$VT_LIB_DIR -lVT $VT_ADD_LIBS" \
+
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_lapack95_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_blas95_lp64.a"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a"  
+
+
+# DCMAKE_BUILD_TYPE=Release or Debug compiler options set in CMAKELIST.txt
+
+#        -DCMAKE_C_FLAGS="-g -O0 -std=c99 -DPRNTlevel=2 -DPROFlevel=1 -DDEBUGlevel=0" \
+        #-DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 ${INC_VTUNE}" \
diff --git a/make.inc.in b/make.inc.in
index 097a38a9..858ab0a3 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -9,32 +9,42 @@
 #  Creation date:   March 1, 2016	version 5.0.0
 #
 #  Modified:	    October 13, 2017    version 5.2.1
-#		    February 23, 2020   version 6.3.0
+#		    February 20, 2021   version 7.0.0
+#		    October 5, 2021     version 7.1.0
 #
 ############################################################################
 #
 #  The name of the libraries to be created/linked to
 #
 SuperLUroot = ${CMAKE_INSTALL_PREFIX}
-DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
-INCLUDEDIR  = -I$(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@ 
-INCLUDEDIR      += -I$(SuperLUroot)/../SRC
-
-XSDK_INDEX_SIZE=@XSDK_INDEX_SIZE@
-SLU_HAVE_LAPACK=@SLU_HAVE_LAPACK@
-HAVE_PARMETIS=@HAVE_PARMETIS@
-HAVE_COMBBLAS=@HAVE_COMBBLAS@
-HAVE_CUDA=@HAVE_CUDA@
-HAVE_HIP=@HAVE_HIP@
-
-LIBS 	    = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm #-lmpi
-LIBS	    += ${LAPACK_LIB_EXPORT}
-LIBS	    += ${PARMETIS_LIB_EXPORT}
-LIBS 	    += ${COMBBLAS_LIB_EXPORT}
-LIBS 	    += ${EXTRA_LIB_EXPORT}
-LIBS        += ${EXTRA_FLIB_EXPORT}
-CUDALIBS    = ${CUDA_LIB_EXPORT}
-LIBS        += ${CUDA_LIB_EXPORT}
+#DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
+DSUPERLULIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_EXPORT}
+INCLUDEDIR  = $(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@
+
+XSDK_INDEX_SIZE = @XSDK_INDEX_SIZE@
+SLU_HAVE_LAPACK = @SLU_HAVE_LAPACK@
+HAVE_PARMETIS   = @HAVE_PARMETIS@
+HAVE_COMBBLAS   = @HAVE_COMBBLAS@
+HAVE_CUDA       = @HAVE_CUDA@
+HAVE_HIP        = @HAVE_HIP@
+
+XSDK_ENABLE_Fortran = @XSDK_ENABLE_Fortran@
+ifeq ($(XSDK_ENABLE_Fortran),ON)
+  DFORTRANLIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_FORTRAN}
+  LIBS = $(DFORTRANLIB) $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm
+  LIBS += ${EXTRA_FLIB_EXPORT}
+else
+  LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm
+endif
+
+LIBS	 += ${LAPACK_LIB_EXPORT}
+LIBS	 += ${PARMETIS_LIB_EXPORT}
+LIBS 	 += ${COMBBLAS_LIB_EXPORT}
+LIBS 	 += ${EXTRA_LIB_EXPORT}
+# LIBS     += ${CUDA_LIB_EXPORT}
+
+CUDALIBS = ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}
+LIBS     += $(CUDALIBS)
 
 #
 #  The archiver and the flag(s) to use when building archive (library)
@@ -45,17 +55,18 @@ ARCHFLAGS    = cr
 RANLIB       = @CMAKE_RANLIB@
 
 CC           = @CMAKE_C_COMPILER@
-NVCC           = @CMAKE_CUDA_COMPILER@
-CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@
-CUDACFLAGS 	 = @CMAKE_CUDA_FLAGS@
+CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ${SHARED_C_FLAGS_EXPORT}
+##@CMAKE_SHARED_LIBRARY_C_FLAGS@
 #CFLAGS      += -D${DirDefs}
 # CFLAGS     += @COMPILE_DEFINITIONS@
 CXX          = @CMAKE_CXX_COMPILER@
 CXXFLAGS     = @CMAKE_CXX_FLAGS_RELEASE@ @CMAKE_CXX_FLAGS@
 NVCC	     = @CMAKE_CUDA_COMPILER@
-CUDACFLAGS   = @CMAKE_CUDA_FLAGS@
+NVCCFLAGS    = @CMAKE_CUDA_FLAGS@
+
 NOOPTS       = -O0
 FORTRAN	     = @CMAKE_Fortran_COMPILER@
+FFLAGS	     = @CMAKE_Fortran_FLAGS@ @Fortran_INCLUDES@
 
 LOADER       = @CMAKE_CXX_COMPILER@
-LOADOPTS     = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@
+LOADOPTS     = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@ @CMAKE_Fortran_LINK_FLAGS@
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index fa271b7d..b318c0ce 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -7,6 +7,8 @@ then
     
 elif [ "$NERSC_HOST" == "cori" ]
 then
+#    rm -fr 64-build; mkdir 64-build; cd 64-build;
+#    export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3-64
     rm -fr cori-build; mkdir cori-build; cd cori-build;
     export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3
 #    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/shared-build
@@ -55,7 +57,10 @@ then
     -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=0 -DDEBUGlevel=0" \
     -DCMAKE_C_COMPILER=mpicc \
     -DCMAKE_CXX_COMPILER=mpicxx \
+    -DCMAKE_CXX_FLAGS="-std=c++11" \
     -DCMAKE_Fortran_COMPILER=mpif90 \
+    -DCMAKE_LINKER=mpicxx \
+    -Denable_openmp=ON \
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \