diff --git a/.ci_tests.sh b/.ci_tests.sh new file mode 100755 index 00000000..e9664161 --- /dev/null +++ b/.ci_tests.sh @@ -0,0 +1,31 @@ +#!/bin/sh +set -e + +export RED="\033[31;1m" +export BLUE="\033[34;1m" +export ROOT_DIR="$PWD" +printf "${BLUE} SLU; Entered tests file:\n" + + +export DATA_FOLDER=$ROOT_DIR/EXAMPLE +export EXAMPLE_FOLDER=$ROOT_DIR/build/EXAMPLE +export TEST_FOLDER=$ROOT_DIR/build/TEST + +case "${TEST_NUMBER}" in +1) mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +2) mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +3) mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +4) mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +5) mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +6) mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +7) mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +8) mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;; +9) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +10) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +11) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +12) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive1" "-r" "2" "-c" "2" 
"$DATA_FOLDER/cg20.cua" ;; +13) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;; +14) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;; +15) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive_ABglobal" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;; +*) printf "${RED} ###SLU: Unknown test\n" ;; +esac diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..391f4adc --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,97 @@ +name: Run Github CI tests. + +on: [push, pull_request] + +jobs: + test: + name: Build and test + runs-on: ubuntu-latest + + strategy: + matrix: + compiler: + - gcc + test: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Install dependencies + run: | + export BLUE="\033[34;1m" + mkdir -p installDir + + printf "${BLUE} SLU; Installing gcc-9 via apt\n" + sudo apt-get update + sudo apt-get install build-essential software-properties-common -y + sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y + sudo apt-get update + sudo apt-get install gcc-9 g++-9 -y + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9 + export CXX="g++-9" + export CC="gcc-9" + printf "${BLUE} SLU; Done installing gcc-9 via apt\n" + + printf "${BLUE} SLU; Installing gfortran via apt\n" + sudo apt-get install gfortran-9 -y + sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-9 60 + printf "${BLUE} SLU; Done installing gfortran via apt\n" + + printf "${BLUE} SLU; Installing openmpi\n" + sudo apt-get install openmpi-bin libopenmpi-dev + printf "${BLUE} SLU; Done installing openmpi\n" + + printf "${BLUE} SLU; Installing BLASfrom apt\n" + sudo apt-get install libblas-dev + export BLAS_LIB=/usr/lib/libblas/libblas.so + printf "${BLUE} SLU; Done installing BLASfrom 
apt\n" + + printf "${BLUE} SLU; Installing LAPACKfrom apt\n" + sudo apt-get install liblapack-dev + export LAPACK_LIB=/usr/lib/liblapack.so + printf "${BLUE} SLU; Done installing LAPACKfrom apt\n" + + printf "${BLUE} SLU; Installing ParMetis-4.0 from source\n" + cd $GITHUB_WORKSPACE/installDir + wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/parmetis/parmetis-4.0.3.tar.gz + tar -xf parmetis-4.0.3.tar.gz + cd parmetis-4.0.3/ + mkdir -p install + make config shared=1 cc=mpicc cxx=mpic++ prefix=$PWD/install + make install > make_parmetis_install.log 2>&1 + printf "${BLUE} SLU; Done installing ParMetis-4.0 from source\n" + + - name: Install package + run: | + export BLUE="\033[34;1m" + printf "${BLUE} SLU; Installing superlu_dist from source\n" + cd $GITHUB_WORKSPACE + rm -rf build + mkdir -p build + cd build + cmake .. \ + -DTPL_PARMETIS_INCLUDE_DIRS="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/metis/include;$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/include" \ + -DTPL_PARMETIS_LIBRARIES="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/lib/libparmetis.so" \ + -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=1" \ + -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \ + -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \ + -DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \ + -Denable_blaslib=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DCMAKE_INSTALL_PREFIX=. \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + make + make install + printf "${BLUE} SLU; Done installing superlu_dist from source\n" + + - name: Test + run: | + cd $GITHUB_WORKSPACE + export TEST_NUMBER=${{ matrix.test }} + ./.ci_tests.sh + + diff --git a/.gitignore b/.gitignore index cd5d67bb..39be29c9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,11 @@ # You have to ignore this generated file or git will complain that it is an # unknown file! 
-/make.inc +make.inc # If the instructions are telling people to create this build dir under the # source tree, you had better put in an ignore for this. -#/build/* +build/* + +# not to commit any changes to the following file +FORTRAN/superlu_dist_config.fh diff --git a/.travis.yml b/.travis.yml index af98f264..3e758231 100644 --- a/.travis.yml +++ b/.travis.yml @@ -111,4 +111,4 @@ install: script: - cd $TRAVIS_BUILD_DIR - - ./.travis_tests.sh + - ./.ci_tests.sh diff --git a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt index 3d259fef..b0be69b7 100644 --- a/CBLAS/CMakeLists.txt +++ b/CBLAS/CMakeLists.txt @@ -46,5 +46,5 @@ endif() add_library(blas ${sources} ${HEADERS}) -install(TARGETS blas DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +install(TARGETS blas DESTINATION "${INSTALL_LIB_DIR}") +install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04dda0b6..6de4c56d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,14 +5,14 @@ ###################################################################### # Required version -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18.1 FATAL_ERROR) # Project version numbers #project(SuperLU_DIST C CXX CUDA) project(SuperLU_DIST C CXX) -set(VERSION_MAJOR "6") -set(VERSION_MINOR "4") -set(VERSION_BugFix "0") +set(VERSION_MAJOR "7") +set(VERSION_MINOR "1") +set(VERSION_BugFix "1") set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix}) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") @@ -20,19 +20,27 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Set up options option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) +option(enable_single "Enable single precision library" OFF) option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build 
tests" ON) option(enable_examples "Build examples" ON) +option(XSDK_ENABLE_Fortran "Enable Fortran" ON) + +#-- BLAS option(TPL_ENABLE_INTERNAL_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) option(TPL_BLAS_LIBRARIES "List of absolute paths to blas libraries [].") -option(TPL_ENABLE_LAPACKLIB "Enable lapack library" ON) -option(TPL_LAPACK_LIBRARIES "List of absolute paths to lapack libraries [].") +#-- ParMETIS option(TPL_ENABLE_PARMETISLIB "Build the ParMETIS library" ON) option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].") option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].") -option(TPL_ENABLE_COMBBLASLIB "BUILD THE COMBBLAS LIBRARY" OFF) -OPTION(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].") +#-- LAPACK +option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" OFF) +option(TPL_LAPACK_LIBRARIES "List of absolute paths to LAPACK libraries [].") +#-- CombBLAS +option(TPL_ENABLE_COMBBLASLIB "Build the CombBLAS library" OFF) +option(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].") option(TPL_COMBBLAS_INCLUDE_DIRS "List of absolute paths to CombBLAS include directories [].") +#-- CUDA option(TPL_ENABLE_CUDALIB "Enable the CUDA libraries" OFF) option(TPL_ENABLE_HIPLIB "Enable the HIP libraries" OFF) @@ -40,17 +48,30 @@ option(TPL_ENABLE_HIPLIB "Enable the HIP libraries" OFF) # # IDEAS: xSDK standards module #MESSAGE("\nProcess XSDK defaults ...") -#SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired +# SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired #INCLUDE("cmake/XSDKDefaults.cmake") ###################################################################### -INCLUDE(CTest) +include(CTest) +include(CheckLanguage) ###################################################################### # # Usual initialization stuff # ###################################################################### 
+set(CMAKE_CXX_STANDARD 11) +#set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") + +if (XSDK_ENABLE_Fortran) + enable_language (Fortran) + set(NOFORTRAN FALSE) +endif() + +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ## ???? set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib") #---- For shared library @@ -80,25 +101,17 @@ if (BUILD_SHARED_LIBS) set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.so) SET(CMAKE_EXE_LINKER_FLAGS - "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") if (BUILD_STATIC_LIBS) message("-- SuperLU_DIST will also be built as a static library.") endif() + set(SHARED_C_FLAGS_EXPORT ${CMAKE_SHARED_LIBRARY_C_FLAGS}) else() message("-- SuperLU_DIST will be built as a static library.") set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.a) endif() -set(CMAKE_CXX_STANDARD 11) -#set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") - -if (XSDK_ENABLE_Fortran) - enable_language (Fortran) - set(NOFORTRAN FALSE) -endif() set(SUPERLU_VERSION "${PROJECT_VERSION}") set(SUPERLU_REV "${PROJECT_REV}") @@ -113,13 +126,12 @@ if (NOT CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX /usr/local) endif() - if(NOT MSVC) include(GNUInstallDirs) set(default_install_inc_dir ${CMAKE_INSTALL_INCLUDEDIR}) set(default_install_lib_dir ${CMAKE_INSTALL_LIBDIR}) set(default_install_bin_dir ${CMAKE_INSTALL_BINDIR}) -else() +else() # for Windows set(default_install_inc_dir "include") set(default_install_lib_dir "lib") set(default_install_bin_dir "bin") @@ -131,13 +143,15 @@ set(INSTALL_BIN_DIR "${default_install_bin_dir}" CACHE STRING "The folder where # Set up required compiler defines and options. 
## get_directory_property( DirDefs COMPILE_DEFINITIONS ) -# set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}") -# set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") if(XSDK_INDEX_SIZE EQUAL 64) message("-- Using 64 bit integer for index size.") endif() -set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -g" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") ###################################################################### # @@ -155,12 +169,53 @@ if(MPI_C_FOUND) endif() if (XSDK_ENABLE_Fortran) if(MPI_Fortran_FOUND) - add_definitions(${MPI_Fortran_COMPILE_FLAGS}) - include_directories(${MPI_Fortran_INCLUDE_PATH}) set(CMAKE_Fortran_FLAGS "${MPI_Fortran_COMPILE_FLAGS} ${CMAKE_Fortran_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_Fortran_LINK_FLAGS}") + include_directories(${MPI_Fortran_INCLUDE_PATH}) endif() endif() + +#---- CUDA libraries +if (TPL_ENABLE_CUDALIB) ## want to use cuda + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + message("-- Enabled support for CUDA.") + enable_language(CUDA) + find_package(CUDA REQUIRED) + if (CUDA_FOUND) + if (NOT CMAKE_CUDA_FLAGS) + cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) + endif() + set(HAVE_CUDA TRUE) + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG" CACHE STRING "") + set(CMAKE_CUDA_FLAGS_DDEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g" CACHE STRING "") + endif() + +# find_package(CUB REQUIRED) + + find_package(CUDAToolkit REQUIRED) + message("-- CUDAToolkit_LIBRARY_ROOT='${CUDAToolkit_LIBRARY_ROOT}'") + if (NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "") + set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") + set(CUDA_CUBLAS_LIBRARIES 
"${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") + else() + message("-- CUDAToolkit_LIBRARY_ROOT empty, not setting CUDA_LIBRARIES") + endif() + + +# # The following make.inc exporting does not work +# set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver) +# # fix up CUDA library names +# string (REPLACE ";" " " CUDA_LIB_STR "${CUDA_LIB}") +# set(CUDA_LIB_EXPORT ${CUDA_LIB_STR}) +# set(HAVE_CUDA TRUE) + # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_CUDA") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDA") + else() + message("-- CUDA libraries not found.") + endif() +endif() + #--------------------- OpenMP --------------------- if (NOT DEFINED enable_openmp) set(enable_openmp TRUE) @@ -171,9 +226,9 @@ if (enable_openmp) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") -# On edison, OpenMP_EXE_LINKER_FLAGS is empty -# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# The following causes problem with cmake/3.20.+ +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'") message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") endif() @@ -275,9 +330,9 @@ else() add_subdirectory(CBLAS) set(BLAS_LIB blas) if (BUILD_SHARED_LIBS) # export to be referenced by downstream makefile - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.so) else() - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.a) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.a) endif() endif() @@ -349,8 +404,6 @@ if(PARMETIS_FOUND) set(HAVE_PARMETIS TRUE) endif() - - #---------------------- Additional C linker 
library --------- SET(_c_libs ${CMAKE_C_IMPLICIT_LINK_LIBRARIES}) FOREACH(_lib ${_c_libs}) @@ -369,6 +422,16 @@ if (XSDK_ENABLE_Fortran) string (REPLACE ";" " " EXTRA_FLIB_STR "${EXTRA_FLIB}") set(EXTRA_FLIB_EXPORT ${EXTRA_FLIB_STR}) message("-- EXTRA_FLIB_EXPORT='${EXTRA_FLIB_EXPORT}'") + + if (BUILD_SHARED_LIBS) + message("-- superlu_dist_fortran will be built as a dynamic library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.so) + SET(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + else() + message("-- superlu_dist_fortran will be built as a static library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.a) + endif() endif() #--------------------- CombBLAS --------------------- @@ -389,6 +452,7 @@ if (TPL_ENABLE_COMBBLASLIB) ## want to use CombBLAS message("-- Enabled support for COMBBLAS") set(COMBBLAS_FOUND TRUE) + set(CMAKE_CXX_STANDARD 14) # CombBLAS requires c++14 set(COMBBLAS_LIB ${TPL_COMBBLAS_LIBRARIES}) # fix up COMBBLAS library names @@ -414,8 +478,9 @@ endif() if (XSDK_ENABLE_Fortran) include(FortranCInterface) FortranCInterface_HEADER(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_FortranCInterface.h - MACRO_NAMESPACE "FC_") + MACRO_NAMESPACE "FC_") FortranCInterface_VERIFY(CXX) + SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") endif() ###################################################################### @@ -458,19 +523,23 @@ if(enable_examples) endif() if (XSDK_ENABLE_Fortran) -# SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lstdc++") - SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") add_subdirectory(FORTRAN) endif() # superlu_dist uses c++11. 
PUBLIC means that the other codes linking to it need c++11 #target_compile_features(SuperLU_DIST PUBLIC cxx_std_11) +# Generate various configure files with proper definitions # configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc) configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc) + configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/SRC/superlu_dist_config.h) configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h) +# Following is to configure a file for FORTRAN code +configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/FORTRAN/superlu_dist_config.h) + + # Add pkg-config support if(IS_ABSOLUTE ${CMAKE_INSTALL_LIBDIR}) set(pkgconfig_libdir ${CMAKE_INSTALL_LIBDIR}) @@ -481,4 +550,4 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BI install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -#message("CMAKE_CXX_LINK_FLAGS '${CMAKE_CXX_LINK_FLAGS}'") +#message("MPI_Fortran_LINK_FLAGS '${MPI_Fortran_LINK_FLAGS}'") diff --git a/DoxyConfig b/DoxyConfig index e50bb432..2b18411f 100644 --- a/DoxyConfig +++ b/DoxyConfig @@ -31,8 +31,8 @@ PROJECT_NAME = SuperLU Distributed # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 6.3.0 -e +PROJECT_NUMBER = 7.1.0 + # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location @@ -847,7 +847,7 @@ TREEVIEW_WIDTH = 250 # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. -GENERATE_LATEX = NO +GENERATE_LATEX = YES # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be @@ -858,7 +858,7 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. -LATEX_CMD_NAME = latex +LATEX_CMD_NAME = pdflatex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt old mode 100644 new mode 100755 index 1253eb62..b4c6b371 --- a/EXAMPLE/CMakeLists.txt +++ b/EXAMPLE/CMakeLists.txt @@ -37,6 +37,7 @@ if(enable_double) set(DEXM pddrive.c dcreate_matrix.c) add_executable(pddrive ${DEXM}) target_link_libraries(pddrive ${all_link_libs}) + install(TARGETS pddrive RUNTIME DESTINATION "${INSTALL_BIN_DIR}") set(DEXM1 pddrive1.c dcreate_matrix.c) add_executable(pddrive1 ${DEXM1}) @@ -57,6 +58,22 @@ if(enable_double) add_executable(pddrive4 ${DEXM4}) target_link_libraries(pddrive4 ${all_link_libs}) + set(DEXM3D pddrive3d.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d ${DEXM3D}) + target_link_libraries(pddrive3d ${all_link_libs}) + + set(DEXM3D1 pddrive3d1.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d1 ${DEXM3D1}) + target_link_libraries(pddrive3d1 ${all_link_libs}) + + set(DEXM3D2 pddrive3d2.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d2 ${DEXM3D2}) + target_link_libraries(pddrive3d2 ${all_link_libs}) + + set(DEXM3D3 pddrive3d3.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d3 ${DEXM3D3}) + target_link_libraries(pddrive3d3 ${all_link_libs}) + set(DEXMG pddrive_ABglobal.c) add_executable(pddrive_ABglobal ${DEXMG}) target_link_libraries(pddrive_ABglobal ${all_link_libs}) @@ -80,10 +97,53 @@ if(enable_double) set(DEXMS pddrive_spawn.c dcreate_matrix.c) add_executable(pddrive_spawn ${DEXMS}) target_link_libraries(pddrive_spawn ${all_link_libs}) - - -endif() - 
+ install(TARGETS pddrive_spawn RUNTIME DESTINATION "${INSTALL_BIN_DIR}") + + + +endif() #### end enable_double + +if(enable_single) + set(SEXM psdrive.c screate_matrix.c) + add_executable(psdrive ${SEXM}) + target_link_libraries(psdrive ${all_link_libs}) + + set(SEXM1 psdrive1.c screate_matrix.c) + add_executable(psdrive1 ${SEXM1}) + target_link_libraries(psdrive1 ${all_link_libs}) + add_superlu_dist_example(psdrive1 big.rua 2 2) + + set(SEXM2 psdrive2.c screate_matrix.c screate_matrix_perturbed.c) + add_executable(psdrive2 ${SEXM2}) + target_link_libraries(psdrive2 ${all_link_libs}) + add_superlu_dist_example(psdrive2 big.rua 2 2) + + set(SEXM3 psdrive3.c screate_matrix.c) + add_executable(psdrive3 ${SEXM3}) + target_link_libraries(psdrive3 ${all_link_libs}) + add_superlu_dist_example(psdrive3 big.rua 2 2) + + set(SEXM4 psdrive4.c screate_matrix.c) + add_executable(psdrive4 ${SEXM4}) + target_link_libraries(psdrive4 ${all_link_libs}) + + set(SEXM3D psdrive3d.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d ${SEXM3D}) + target_link_libraries(psdrive3d ${all_link_libs}) + + set(SEXM3D1 psdrive3d1.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d1 ${SEXM3D1}) + target_link_libraries(psdrive3d1 ${all_link_libs}) + + set(SEXM3D2 psdrive3d2.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d2 ${SEXM3D2}) + target_link_libraries(psdrive3d2 ${all_link_libs}) + + set(SEXM3D3 psdrive3d3.c screate_matrix.c screate_matrix3d.c) + add_executable(psdrive3d3 ${SEXM3D3}) + target_link_libraries(psdrive3d3 ${all_link_libs}) + +endif() #### end enable_single if(enable_complex16) @@ -110,6 +170,22 @@ if(enable_complex16) add_executable(pzdrive4 ${ZEXM4}) target_link_libraries(pzdrive4 ${all_link_libs}) + set(ZEXM3D pzdrive3d.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d ${ZEXM3D}) + target_link_libraries(pzdrive3d ${all_link_libs}) + + set(ZEXM3D1 pzdrive3d1.c zcreate_matrix.c zcreate_matrix3d.c) + 
add_executable(pzdrive3d1 ${ZEXM3D1}) + target_link_libraries(pzdrive3d1 ${all_link_libs}) + + set(ZEXM3D2 pzdrive3d2.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d2 ${ZEXM3D2}) + target_link_libraries(pzdrive3d2 ${all_link_libs}) + + set(ZEXM3D3 pzdrive3d3.c zcreate_matrix.c zcreate_matrix3d.c) + add_executable(pzdrive3d3 ${ZEXM3D3}) + target_link_libraries(pzdrive3d3 ${all_link_libs}) + set(ZEXMG pzdrive_ABglobal.c) add_executable(pzdrive_ABglobal ${ZEXMG}) target_link_libraries(pzdrive_ABglobal ${all_link_libs}) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 0af33c41..6fdf47e2 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -30,13 +30,18 @@ ####################################################################### include ../make.inc -DEXM = pddrive.o dcreate_matrix.o sp_ienv.o #pdgstrf2.o -#pdgssvx.o -# pdgstrs_lsum_X1.o pdgstrf_X1.o +DEXM = pddrive.o dcreate_matrix.o #pdgstrf2.o DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o + +DEXM3D = pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D1 = pddrive3d1.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D2 = pddrive3d2.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D3 = pddrive3d3.o dcreate_matrix.o dcreate_matrix3d.o + +# dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o DEXMG2 = pddrive2_ABglobal.o @@ -48,6 +53,11 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o + ZEXMG = pzdrive_ABglobal.o ZEXMG1 = 
pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -58,14 +68,16 @@ ZEXMG4 = pzdrive4_ABglobal.o all: double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ + pddrive3d pddrive3d1 pddrive3d2 pddrive3d3 \ pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ pddrive3_ABglobal pddrive4_ABglobal complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \ + pzdrive3d pzdrive3d1 pzdrive3d2 pzdrive3d3 \ pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \ pzdrive3_ABglobal pzdrive4_ABglobal -pddrive: $(DEXM) $(DSUPERLULIB) +pddrive: $(DEXM) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@ pddrive1: $(DEXM1) $(DSUPERLULIB) @@ -80,6 +92,18 @@ pddrive3: $(DEXM3) $(DSUPERLULIB) pddrive4: $(DEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@ +pddrive3d: $(DEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ + +pddrive3d1: $(DEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D1) $(LIBS) -lm -o $@ + +pddrive3d2: $(DEXM3D2) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D2) $(LIBS) -lm -o $@ + +pddrive3d3: $(DEXM3D3) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D3) $(LIBS) -lm -o $@ + pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@ @@ -113,6 +137,18 @@ pzdrive3: $(ZEXM3) $(DSUPERLULIB) pzdrive4: $(ZEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@ +pzdrive3d: $(ZEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D) $(LIBS) -lm -o $@ + +pzdrive3d1: $(ZEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D1) $(LIBS) -lm -o $@ + +pzdrive3d2: $(ZEXM3D2) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D2) $(LIBS) -lm -o $@ + +pzdrive3d3: $(ZEXM3D3) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D3) $(LIBS) -lm -o $@ + pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@ diff --git a/EXAMPLE/README b/EXAMPLE/README index b8fa138c..110484d2 100644 --- a/EXAMPLE/README +++ b/EXAMPLE/README @@ -1,11 +1,25 @@ - 
SuperLU_DIST EXAMPLES - ====================== + SuperLU_DIST EXAMPLES + ===================== This directory contains sample programs to illustrate how to use various functions provided in SuperLU_DIST. You can modify these examples to suit your applications. +(double real: pddrive* + double complex: pzdrive* ) The examples illustrate the following functionalities: + 0. pddrive3d.c: (invoke new communication-avoiding 3D algorithms) + Use PxGSSVX3D with the default options to solve a linear system + pddrive3d1.c: + Use PxGSSVX3D to solve the systems with same A but different + right-hand side. (Reuse the factored form of A) + pddrive3d2.c: + Use PxGSSVX3D to solve the systems with same sparsity pattern + of A. (Reuse the sparsity ordering) + pddrive3d2.c: + Use PxGSSVX3D to solve the systems with same sparsity pattern + and similar values. (Reuse sparsity ordering and row pivoting) + 1. pddrive.c, pddrive_ABglobal.c Use PDGSSVX with the full (default) options to solve a linear system. 2. pddrive1.c, pddrive1_ABglobal.c @@ -16,17 +30,22 @@ The examples illustrate the following functionalities: (Reuse the sparsity ordering) 4. pddrive3.c, pddrive3_ABglobal.c Solve the systems with the same sparsity pattern and similar values. + (Reuse sparsity ordering and row pivoting) 5. pddrive4.c, pddrive4_ABglobal.c Divide the processors into two subgroups (two grids) such that each subgroup solves a linear system independently from the other. The command line options "-r " and "-c " -defines the 2-D process grid. The total number of processes is: +defines the 2D process grid. The total number of processes is: = * If the options is not provided at the command line, the programs will use 1 processor as default in each case. +In the 3D code pddrive3d, the command line + "-r ", "-c " and "-d " +defines the 3D process grid. The Z-dimension must be power of two. 
+ Three input matrices (Harwell-Boeing format) are provided in this directory: g20.rua -- a real matrix of dimension 400x400 big.rua -- a real matrix of dimension 4960x4960 @@ -36,6 +55,10 @@ The command lines given below show how to run the parallel programs using "mpiexec". You may need to replace mpiexec by platform specific command. +0. To run the 3D example (pddrive3d), type + % mpiexec -n pddrive3d -r -c -d g20.rua + (e.g., mpiexec -n 8 pddrive3d -r 2 -c 2 -d 2 g20.rua) + 1. To run the real examples (pddrive, pddrive1, etc.) you may type: % mpiexec -n pddrive -r -c g20.rua diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c index 1784949b..8486f419 100644 --- a/EXAMPLE/dcreate_matrix.c +++ b/EXAMPLE/dcreate_matrix.c @@ -235,7 +235,6 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs, } - int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, char * postfix, gridinfo_t *grid) diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c new file mode 100644 index 00000000..6e9ccc21 --- /dev/null +++ b/EXAMPLE/dcreate_matrix3d.c @@ -0,0 +1,463 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_ddefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * DCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) double**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) double**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID3D (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
+ */ + +int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* 
Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + 
int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if (!strcmp(postfix, "rua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + 
{ + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/dnrformat_loc3d.c b/EXAMPLE/dnrformat_loc3d.c new file mode 100644 index 00000000..1fbaca69 --- /dev/null +++ b/EXAMPLE/dnrformat_loc3d.c @@ -0,0 +1,404 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void dGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ double *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an output;
+ 		              else it is an input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit (NOTE(review): shadows the function-scope declaration above)
+	int *nnz_disp; // displacement (NOTE(review): shadows the function-scope declaration above)
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->rowptr[0] = 0;
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	
+	if (grid3d->zscp.Iam == 0) {
+		A2d->rowptr[0] = 0;
+		
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	} /* end 2D layer grid-0 */
+		    
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	
+	/* Btmp <- compact(B), compacting B */
+	double *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	double *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+		    b_counts_int, b_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* dGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, p;
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    double *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    double *Btmp; // on 3D grid
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+	*/
+        MPI_Request recv_req;
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep;
+
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* dScatter_B3d */
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 4107abaa..10bc8091 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -23,7 +23,6 @@ at the top-level directory.
 
 #include 
 #include "superlu_ddefs.h"
-//#include "superlu_zdefs.h"
 
 /*! \brief
  *
@@ -140,7 +139,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( (iam >= nprow * npcol) || (iam == -1) ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -198,14 +197,16 @@ int main(int argc, char *argv[])
 	options.DiagInv           = NO;
      */
     set_default_options_dist(&options);
-	options.IterRefine = NOREFINE;
-	options.DiagInv = YES;
-    options.ReplaceTinyPivot  = YES;
+	// options.IterRefine = NOREFINE;
+	// options.DiagInv = YES;
+    // options.ReplaceTinyPivot  = YES;
+    
 	// options.Equil = NO; 
 	// options.ColPerm = NATURAL;
 	// options.RowPerm = NOROWPERM;  						  
 #if 0
-    options.RowPerm = LargeDiag_HWPM;
+    options.RowPerm           = LargeDiag_HWPM;
+    options.RowPerm = NOROWPERM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
@@ -232,10 +233,16 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+	    printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+	    fflush(stdout);
+	}
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+		         nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
@@ -248,9 +255,7 @@ int main(int argc, char *argv[])
     dScalePermstructFree(&ScalePermstruct);
     dDestroy_LU(n, &grid, &LUstruct);
     dLUstructFree(&LUstruct);
-    if ( options.SolveInitialized ) {
-        dSolveFinalize(&options, &SOLVEstruct);
-    }
+    dSolveFinalize(&options, &SOLVEstruct);
     SUPERLU_FREE(b);
     SUPERLU_FREE(xtrue);
     SUPERLU_FREE(berr);
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 8e5396a9..f058bd40 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -14,10 +14,11 @@ at the top-level directory.
  * \brief Driver program for PDGSSVX example
  *
  * 
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4, 2020
  * 
*/ @@ -33,7 +34,8 @@ at the top-level directory. * The driver program PDDRIVE1. * * This example illustrates how to use PDGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PDGSSVX, and reuse the following data structures * in the subsequent call to PDGSSVX: @@ -54,8 +56,8 @@ int main(int argc, char *argv[]) dSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - double *b, *xtrue, *b1; - int i, j, m, n; + double *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -65,7 +67,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -104,7 +106,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. 
*/ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -141,14 +143,24 @@ int main(int argc, char *argv[]) dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -171,9 +183,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ dScalePermstructInit(m, n, &ScalePermstruct); dLUstructInit(n, &LUstruct); @@ -182,41 +191,90 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. 
*/ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pdgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pdgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); dScalePermstructFree(&ScalePermstruct); dDestroy_LU(n, &grid, &LUstruct); @@ -226,6 +284,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c index ad74edc6..7686b79c 100644 --- a/EXAMPLE/pddrive1_ABglobal.c +++ b/EXAMPLE/pddrive1_ABglobal.c @@ -72,6 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { @@ -105,8 +111,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c index 5ad5a3ae..6bdc7007 100644 --- a/EXAMPLE/pddrive2.c +++ b/EXAMPLE/pddrive2.c @@ -33,8 +33,8 @@ at the top-level directory. * * The driver program PDDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PDGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PDGSSVX: @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -147,7 +147,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -156,7 +157,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: @@ -190,20 +191,27 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ dDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -218,18 +226,25 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. 
*/ /* Solve the linear system. */ pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } #if ( PRNTlevel>=2 ) if (iam==0) { PrintInt10("new perm_r", m, ScalePermstruct.perm_r); diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c index fad13d43..e908a6ca 100644 --- a/EXAMPLE/pddrive2_ABglobal.c +++ b/EXAMPLE/pddrive2_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -105,8 +110,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c index 4287cae0..d3d6683c 100644 --- a/EXAMPLE/pddrive3.c +++ b/EXAMPLE/pddrive3.c @@ -35,9 +35,9 @@ at the top-level directory. * This example illustrates how to use PDGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. 
- * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PDGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PDGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -206,8 +206,15 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -231,8 +238,9 @@ int main(int argc, char *argv[]) nzval1[0] += 1.0e-8; } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ dZeroLblocks(iam, n, &grid, &LUstruct); + dZeroUblocks(iam, n, &grid, &LUstruct); dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -242,16 +250,23 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - if ( !iam ) - printf("Solve a system with the same pattern and similar values.\n"); - pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c index 775dc5a7..e20c664d 100644 --- a/EXAMPLE/pddrive3_ABglobal.c +++ b/EXAMPLE/pddrive3_ABglobal.c @@ -78,7 +78,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -111,8 +116,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c new file mode 100644 index 00000000..02f4837c --- /dev/null +++ b/EXAMPLE/pddrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. 
Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D.
+ *
+ * This example illustrates how to use PDGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pdgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

pddrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c new file mode 100644 index 00000000..b1a553a4 --- /dev/null +++ b/EXAMPLE/pddrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D1.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PDGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PDGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d1 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
+ *
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *
+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d2.c b/EXAMPLE/pddrive3d2.c new file mode 100644 index 00000000..5fed6157 --- /dev/null +++ b/EXAMPLE/pddrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D2.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d2 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
+ *
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *
+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int dcreate_matrix_perturbed + (SuperMatrix *, int, double **, int *, double **, int *, + FILE *, gridinfo_t *); + extern int dcreate_matrix_perturbed_postfix + (SuperMatrix *, int, double **, int *, double **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + dDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d3.c b/EXAMPLE/pddrive3d3.c new file mode 100644 index 00000000..3a7ccb59 --- /dev/null +++ b/EXAMPLE/pddrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D3.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PDGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np 

pddrive3d3 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + dZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + dZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index d7289de5..6cbac44f 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); @@ -191,9 +191,16 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid1); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -256,10 +263,17 @@ int main(int argc, char *argv[]) pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid2); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index f870de85..2cf76078 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -71,6 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); @@ -126,7 +132,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c index d6719e41..3541ab92 100644 --- a/EXAMPLE/pddrive_ABglobal.c +++ b/EXAMPLE/pddrive_ABglobal.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -106,8 +111,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( VAMPIR>=1 ) VT_traceoff(); diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 47b04729..b119b46e 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) @@ -151,7 +156,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -247,7 +252,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. */ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ @@ -265,7 +270,7 @@ int main(int argc, char *argv[]) result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf("**************************************************\n"); fflush(stdout); @@ -302,6 +307,7 @@ int main(int argc, char *argv[]) RELEASE THE SUPERLU PROCESS GRID. 
------------------------------------------------------------*/ out: +if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -309,7 +315,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c new file mode 100644 index 00000000..72db7f96 --- /dev/null +++ b/EXAMPLE/psdrive.c @@ -0,0 +1,291 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * December 6, 2018
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE.
+ *
+ * This example illustrates how to use PSGSSVX with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *xtrue; + int m, n; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + int cpp_defs(); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + //MPI_Init( &argc, &argv ); + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( VTUNE>=1 ) + __itt_pause(); +#endif + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %4d)\n", nprow); + printf("\t-c : process columns (default %4d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + if(grid.iam==0){ + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( (iam >= nprow * npcol) || (iam == -1) ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid, &LUstruct); + sLUstructFree(&LUstruct); + sSolveFinalize(&options, &SOLVEstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive1.c b/EXAMPLE/psdrive1.c new file mode 100644 index 00000000..ac8f1d75 --- /dev/null +++ b/EXAMPLE/psdrive1.c @@ -0,0 +1,327 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * January 4 2020
+ * 
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE1.
+ *
+ * This example illustrates how to use PSGSSVX to
+ * solve systems with the same A but different right-hand side,
+ * possibly with different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to
+ * PSGSSVX, and reuse the following data structures
+ * in the subsequent call to PSGSSVX:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive1 -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 3; /* Max. number of right-hand sides. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc; + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + nrhs = 1; + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. 
*/ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + psgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + psgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + psinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(b1); + SUPERLU_FREE(b2); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive1_ABglobal.c b/EXAMPLE/psdrive1_ABglobal.c new file mode 100644 index 00000000..2ae6fedb --- /dev/null +++ b/EXAMPLE/psdrive1_ABglobal.c @@ -0,0 +1,281 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive1_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * psgssvx_ABglobal, and reuse the following data structures
+ * in the subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive1_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *b, *b1, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) + goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]"); + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + if ( !(xtrue = floatMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve the system with a different B.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. 
*/ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(b1); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c new file mode 100644 index 00000000..ffc93170 --- /dev/null +++ b/EXAMPLE/psdrive2.c @@ -0,0 +1,311 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * December 31, 2016 version 5.1.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE2.
+ *
+ * This example illustrates how to use PSGSSVX to solve systems
+ * repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> psdrive2 -r <proc rows> -c <proc columns> g20.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + NRformat_loc *Astore; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int_t *colind, *colind1, *rowptr, *rowptr1; + int_t i, j, m, n, nnz_loc, m_loc; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + /* prototypes */ + extern int screate_matrix_perturbed + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, gridinfo_t *); + extern int screate_matrix_perturbed_postfix + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %4d)\n", nprow); + printf("\t-c : process columns (default %4d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iim_loc; + + /* ------------------------------------------------------------ + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. 
*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + + if (iam==0) { + print_options_dist(&options); +#if ( PRNTlevel>=2 ) + PrintInt10("perm_r", m, ScalePermstruct.perm_r); + PrintInt10("perm_c", n, ScalePermstruct.perm_c); +#endif + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Solve the linear system. */ + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } +#if ( PRNTlevel>=2 ) + if (iam==0) { + PrintInt10("new perm_r", m, ScalePermstruct.perm_r); + PrintInt10("new perm_c", n, ScalePermstruct.perm_c); + } +#endif + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue1); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} + + diff --git a/EXAMPLE/psdrive2_ABglobal.c b/EXAMPLE/psdrive2_ABglobal.c new file mode 100644 index 00000000..fe4e5a29 --- /dev/null +++ b/EXAMPLE/psdrive2_ABglobal.c @@ -0,0 +1,298 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive2_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once.  The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive2_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + Process 0 reads the matrix A, and then broadcasts it to all + the other processes. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. 
*/ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve the system with the same sparsity pattern.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c new file mode 100644 index 00000000..eaa2e989 --- /dev/null +++ b/EXAMPLE/psdrive3.c @@ -0,0 +1,319 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3.
+ *
+ * This example illustrates how to use PSGSSVX to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PSGSSVX:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX, and reused in the subsequent call.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> psdrive3 -r <proc rows> -c <proc columns> big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + NRformat_loc *Astore; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *b1, *xtrue, *nzval, *nzval1; + int_t *colind, *colind1, *rowptr, *rowptr1; + int_t i, j, m, n, nnz_loc, m_loc, fst_row; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iinnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nzval = Astore->nzval; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval1 = floatMalloc_dist(nnz_loc); + colind1 = intMalloc_dist(nnz_loc); + rowptr1 = intMalloc_dist(m_loc+1); + for (i = 0; i < nnz_loc; ++i) { + nzval1[i] = nzval[i]; + colind1[i] = colind[i]; + } + for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i]; + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. 
*/ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Set up the local A in NR_loc format */ + + /* Perturb the 1st diagonal of the matrix to larger value. + Intention is to change values of A. */ + if (iam == 0) { + } + + /* Zero the numerical values in L and U. */ + sZeroLblocks(iam, n, &grid, &LUstruct); + sZeroUblocks(iam, n, &grid, &LUstruct); + + sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, + nzval1, colind1, rowptr1, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* Print the statistics. 
*/ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3_ABglobal.c b/EXAMPLE/psdrive3_ABglobal.c new file mode 100644 index 00000000..ee06fa2c --- /dev/null +++ b/EXAMPLE/psdrive3_ABglobal.c @@ -0,0 +1,305 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive3_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to psgssvx_ABglobal, and reused in the subsequent call.
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive3_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. 
*/ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve a system with the same pattern and similar values.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d.c b/EXAMPLE/psdrive3d.c new file mode 100644 index 00000000..6cbc21e4 --- /dev/null +++ b/EXAMPLE/psdrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D.
+ *
+ * This example illustrates how to use PSGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

psdrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = floatMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c new file mode 100644 index 00000000..c206d2fe --- /dev/null +++ b/EXAMPLE/psdrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D1.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PSGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PSGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d1 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = floatMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = floatMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = floatMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d2.c b/EXAMPLE/psdrive3d2.c new file mode 100644 index 00000000..d6e0adef --- /dev/null +++ b/EXAMPLE/psdrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D2.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve 
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d2 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int screate_matrix_perturbed + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, gridinfo_t *); + extern int screate_matrix_perturbed_postfix + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + sDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d3.c b/EXAMPLE/psdrive3d3.c new file mode 100644 index 00000000..89095fc5 --- /dev/null +++ b/EXAMPLE/psdrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D3.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve 
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np 

psdrive3d3 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. 
*/ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + sZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + sZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive4.c b/EXAMPLE/psdrive4.c new file mode 100644 index 00000000..55057ca2 --- /dev/null +++ b/EXAMPLE/psdrive4.c @@ -0,0 +1,315 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE4.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n 10 psdrive4 big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n; + int nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + int ii, omp_mpi_level; + char **cpp, c, *postfix; + FILE *fp, *fopen(); + int cpp_defs(); + + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. 
+ ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. 
*/ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid1, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid2, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. 
+ ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive4_ABglobal.c b/EXAMPLE/psdrive4_ABglobal.c new file mode 100644 index 00000000..44736a07 --- /dev/null +++ b/EXAMPLE/psdrive4_ABglobal.c @@ -0,0 +1,361 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive4_ABglobal.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive4_ABglobal  -procs 10
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n, nnz; + int_t nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. 
+ ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } + + /* Create compressed column matrix for A. 
*/ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid1, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid2, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. 
+ ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive_ABglobal.c b/EXAMPLE/psdrive_ABglobal.c new file mode 100644 index 00000000..7d7cdc91 --- /dev/null +++ b/EXAMPLE/psdrive_ABglobal.c @@ -0,0 +1,256 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx_ABglobal
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive_ABglobal -r  -c   -procs 

+ *

+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. 
*/ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index bf6c3e9b..493997b2 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( (iam >= nprow * npcol) || (iam == -1) ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) */ set_default_options_dist(&options); #if 0 - options.RowPerm = LargeDiag_HWPM; + options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; @@ -224,10 +224,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. 
*/ @@ -240,9 +246,7 @@ int main(int argc, char *argv[]) zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); zLUstructFree(&LUstruct); - if ( options.SolveInitialized ) { - zSolveFinalize(&options, &SOLVEstruct); - } + zSolveFinalize(&options, &SOLVEstruct); SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index 6718cd0e..e0f8ff4b 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -13,10 +13,11 @@ at the top-level directory. * \brief Driver program for PZGSSVX example * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4 2020
  * 
*/ @@ -32,7 +33,8 @@ at the top-level directory. * The driver program PZDRIVE1. * * This example illustrates how to use PZGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PZGSSVX, and reuse the following data structures * in the subsequent call to PZGSSVX: @@ -53,8 +55,8 @@ int main(int argc, char *argv[]) zSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - doublecomplex *b, *xtrue, *b1; - int i, j, m, n; + doublecomplex *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -64,7 +66,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -103,7 +105,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. 
*/ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -140,14 +142,24 @@ int main(int argc, char *argv[]) zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -170,9 +182,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ zScalePermstructInit(m, n, &ScalePermstruct); zLUstructInit(n, &LUstruct); @@ -181,41 +190,90 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. 
*/ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pzgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pzgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); @@ -225,6 +283,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c index d2a048d4..bf47169f 100644 --- a/EXAMPLE/pzdrive1_ABglobal.c +++ b/EXAMPLE/pzdrive1_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -104,8 +109,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c index df9b2263..ac042c2b 100644 --- a/EXAMPLE/pzdrive2.c +++ b/EXAMPLE/pzdrive2.c @@ -32,8 +32,8 @@ at the top-level directory. * * The driver program PZDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PZGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PZGSSVX: @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -146,7 +146,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -155,7 +156,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: @@ -189,20 +190,27 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ zDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -217,18 +225,25 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. 
*/ /* Solve the linear system. */ pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } #if ( PRNTlevel>=2 ) if (iam==0) { PrintInt10("new perm_r", m, ScalePermstruct.perm_r); diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c index b6c00cbb..96866c35 100644 --- a/EXAMPLE/pzdrive2_ABglobal.c +++ b/EXAMPLE/pzdrive2_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -104,8 +109,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c index c00415d6..df7a7479 100644 --- a/EXAMPLE/pzdrive3.c +++ b/EXAMPLE/pzdrive3.c @@ -34,9 +34,9 @@ at the top-level directory. * This example illustrates how to use PZGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. 
- * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PZGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PZGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -205,8 +205,15 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -230,8 +237,9 @@ int main(int argc, char *argv[]) nzval1[0].r += 1.0e-8; nzval1[0].i += 1.0e-8; } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ zZeroLblocks(iam, n, &grid, &LUstruct); + zZeroUblocks(iam, n, &grid, &LUstruct); zCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -241,16 +249,23 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - if ( !iam ) - printf("Solve a system with the same pattern and similar values.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c index e1f21ce1..144e3f6e 100644 --- a/EXAMPLE/pzdrive3_ABglobal.c +++ b/EXAMPLE/pzdrive3_ABglobal.c @@ -77,7 +77,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -110,8 +115,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c new file mode 100644 index 00000000..1d35fb24 --- /dev/null +++ b/EXAMPLE/pzdrive3d.c @@ -0,0 +1,420 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. 
Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D.
+ *
+ * This example illustrates how to use PZGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
 + *   4. Call pzgssvx3d
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np 

pzdrive3d -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c new file mode 100644 index 00000000..47905c56 --- /dev/null +++ b/EXAMPLE/pzdrive3d1.c @@ -0,0 +1,448 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! 
@file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D1.
+ *
 + * This example illustrates how to use PZGSSVX3D to solve the systems
+ * with the same A but different right-hand side, possibly with
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PZGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PZGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

pzdrive3d1 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *xtrue, *b1, *b2; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); +#endif + + /* Save two copies of the RHS */ + if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. 
+ ------------------------------------------------------------*/ + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + PStatFree (&stat); + fflush(stdout); + + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (b1); + SUPERLU_FREE (b2); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. 
+ ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d2.c b/EXAMPLE/pzdrive3d2.c new file mode 100644 index 00000000..759b2afb --- /dev/null +++ b/EXAMPLE/pzdrive3d2.c @@ -0,0 +1,424 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D2.
+ *
 + * This example illustrates how to use PZGSSVX3D to solve
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np 

pzdrive3d2 -r -c \ + * -d + * NOTE: total number of processes p = r * c * d + * (d must be a power-of-two, e.g., 1, 2, 4, ...) + * + *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int zcreate_matrix_perturbed + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, gridinfo_t *); + extern int zcreate_matrix_perturbed_postfix + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + zDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d3.c b/EXAMPLE/pzdrive3d3.c new file mode 100644 index 00000000..9eab4a11 --- /dev/null +++ b/EXAMPLE/pzdrive3d3.c @@ -0,0 +1,430 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! 
@file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D3.
+ *
+ * This example illustrates how to use PZGSSVX3D to solve
+ * the systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PZGSSVX3D, and reused in the subsequent call.
+ *
+ * The program may be run by typing:
+ *    mpiexec -np <p> pzdrive3d3 -r <proc rows> -c <proc columns> \
+ *                               -d <proc Z-dimension> <input matrix file>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ *

+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + zZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + zZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 3b768f81..1b95ded7 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); @@ -190,9 +190,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid1); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -255,10 +262,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid2); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 4ec17583..9b3ff81b 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -70,6 +70,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); @@ -125,7 +131,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c index 5871f7b8..b6f48554 100644 --- a/EXAMPLE/pzdrive_ABglobal.c +++ b/EXAMPLE/pzdrive_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -105,9 +110,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) - goto out; - + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index faf725c6..8dab3751 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) @@ -151,7 +156,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam >= nprow * npcol ) goto out; + if ( iam == -1 ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -247,7 +252,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. */ pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ @@ -266,7 +271,7 @@ int main(int argc, char *argv[]) // result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf(" Solve time : %8.2f \n", stat.utime[SOLVE]); @@ -305,6 +310,7 @@ int main(int argc, char *argv[]) RELEASE THE SUPERLU PROCESS GRID. 
------------------------------------------------------------*/ out: + if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -312,7 +318,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c index 8efa3285..153dca2c 100644 --- a/EXAMPLE/pzutil.c +++ b/EXAMPLE/pzutil.c @@ -1,15 +1,15 @@ /*! \file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ -/*! @file +/*! @file * \brief Several matrix utilities * *
@@ -38,15 +38,15 @@ int pzCompRow_loc_to_CompCol_global
     int_t *colind, *rowptr;
     int_t *colptr_loc, *rowind_loc;
     int_t m_loc, n, i, j, k, l;
-    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
+    int_t colnnz, fst_row, nnz_loc, nnz;
     doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
     doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
-    int_t *colcnt, *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
 			   local block rows.
 			   Use n_loc+1 pointers for each block. */
     int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
+			   redistribution to the local block columns.
 			   Use n_loc+1 pointers for each block. */
     int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
     int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
@@ -164,7 +164,7 @@ int pzCompRow_loc_to_CompCol_global
                       a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
                       grid->comm);
     }
-      
+
     /* Reset colptr_loc[] to point to the n_loc global columns. */
     colptr_loc[0] = 0;
     itemp = colptr_send;
@@ -178,7 +178,7 @@ int pzCompRow_loc_to_CompCol_global
 	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
     }
     itemp[n_loc] = colptr_loc[n_loc];
-      
+
     /* Merge blocks of row indices into columns of row indices. */
     for (i = 0; i < procs; ++i) {
         k = i * (n_loc + 1);
@@ -219,12 +219,12 @@ int pzCompRow_loc_to_CompCol_global
     MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
     for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
     GAstore->nnz = nnz;
-    
+
     if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
     if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
+
     /* Allgatherv for row indices. */
     rdispls[0] = 0;
     for (i = 0; i < procs-1; ++i) {
@@ -233,12 +233,12 @@ int pzCompRow_loc_to_CompCol_global
     }
     itemp_32[procs-1] = itemp[procs-1];
     it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
     if ( need_value ) {
       if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
           ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval,
 		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
     } else GAstore->nzval = NULL;
 
@@ -249,7 +249,7 @@ int pzCompRow_loc_to_CompCol_global
         itemp_32[i] = n_locs[i];
     }
     itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
 
     /* Recompute column pointers. */
@@ -277,7 +277,7 @@ int pzCompRow_loc_to_CompCol_global
     SUPERLU_FREE(rowind_recv);
     if ( need_value) SUPERLU_FREE(a_recv);
 #if ( DEBUGlevel>=1 )
-    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
+    if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat));
     CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
 #endif
     return 0;
@@ -371,7 +371,7 @@ int pzPermute_Dense_Matrix
 	++ptr_to_ibuf[p];
 	ptr_to_dbuf[p] += nrhs;
     }
-	  
+
     /* Transfer the (permuted) row indices and numerical values. */
     MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
 		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
@@ -397,12 +397,299 @@ int pzPermute_Dense_Matrix
 } /* pzPermute_Dense_Matrix */
 
 
+/*! \brief Allocate storage in LUstruct */
+void zLUstructInit(const int_t n, zLUstruct_t *LUstruct)
+{
+    if ( !(LUstruct->etree = intMalloc_dist(n)) )
+	ABORT("Malloc fails for etree[].");
+    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
+	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
+	ABORT("Malloc fails for Glu_persist_t.");
+    if ( !(LUstruct->Llu = (zLocalLU_t *)
+	   SUPERLU_MALLOC(sizeof(zLocalLU_t))) )
+	ABORT("Malloc fails for LocalLU_t.");
+	LUstruct->Llu->inv = 0;
+}
+
+/*! \brief Deallocate LUstruct */
+void zLUstructFree(zLUstruct_t *LUstruct)
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zLUstructFree()");
+#endif
+
+    SUPERLU_FREE(LUstruct->etree);
+    SUPERLU_FREE(LUstruct->Glu_persist);
+    SUPERLU_FREE(LUstruct->Llu);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zLUstructFree()");
+#endif
+}
+
+/*! \brief Destroy distributed L & U matrices. */
+void
+zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zDestroy_LU()");
+#endif
+
+    zDestroy_Tree(n, grid, LUstruct);
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc    
+    //#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#endif
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    SUPERLU_FREE(Llu->ToSendR[0]);
+    SUPERLU_FREE(Llu->ToSendR);
+
+    /* The following can be freed only after iterative refinement. */
+    SUPERLU_FREE(Llu->ilsum);
+    SUPERLU_FREE(Llu->fmod);
+    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE(Llu->fsendx_plist);
+    SUPERLU_FREE(Llu->bmod);
+    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE(Llu->bsendx_plist);
+    SUPERLU_FREE(Llu->mod_bit);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+	}	
+    SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i) {
+	if(Llu->Linv_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+	}
+	if(Llu->Uinv_bc_ptr[i]!=NULL){
+	    SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+	}	
+    }
+    SUPERLU_FREE(Llu->Linv_bc_ptr);
+    SUPERLU_FREE(Llu->Uinv_bc_ptr);
+    SUPERLU_FREE(Llu->Unnz);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Urbs[i] ) {
+	    SUPERLU_FREE(Llu->Ucb_indptr[i]);
+	    SUPERLU_FREE(Llu->Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Llu->Ucb_indptr);
+    SUPERLU_FREE(Llu->Ucb_valptr);	
+    SUPERLU_FREE(Llu->Urbs);
+
+    SUPERLU_FREE(Glu_persist->xsup);
+    SUPERLU_FREE(Glu_persist->supno);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zDestroy_LU()");
+#endif
+}
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * 
+ */ +int_t +pzgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, zSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. 
*/ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). */ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + 
rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PZGSTRS_INIT */ + + /*! \brief Initialize the data structure for the solution phase. */ -int zSolveInit(superlu_options_t *options, SuperMatrix *A, +int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + zLUstruct_t *LUstruct, gridinfo_t *grid, + zSOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; @@ -413,29 +700,21 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, fst_row = Astore->fst_row; m_loc = Astore->m_loc; procs = grid->nprow * grid->npcol; - - if ( !grid->iam ) printf("@@@ enter zSolveInit, A->nrow %d\n", A->nrow); if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) ABORT("Malloc fails for row_to_proc[]"); - if ( !grid->iam ) { printf("@@@ malloc(1) zSolveInit\n"); fflush(stdout); } SOLVEstruct->row_to_proc = row_to_proc; - if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); - if ( !grid->iam ) { printf("@@@ malloc(2) zSolveInit\n"); fflush(stdout); } - for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; SOLVEstruct->inv_perm_c = inv_perm_c; - if ( !grid->iam ) printf("@@@ after malloc zSolveInit\n"); - /* ------------------------------------------------------------ EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. 
- + NOTE: For those processes that do not own any row, it must - must be set so that fst_row == A->nrow. + must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); @@ -445,11 +724,6 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (p = 0; p < procs; ++p) { for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } - - if ( !grid->iam ) printf("@@@ after allgather zSolveInit\n"); - -#define DEBUGlevel 2 - #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("fst_row = %d\n", fst_row); @@ -475,34 +749,35 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (i = j ; i < k; ++i) row_to_proc[i] = p; } } -#endif +#endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, &SOLVEstruct->num_diag_procs, &SOLVEstruct->diag_procs, &SOLVEstruct->diag_len); + /* Setup communication pattern for redistribution of B and X. */ if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + pzgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; - + options->SolveInitialized = YES; return 0; } /* zSolveInit */ /*! \brief Release the resources used for the solution phase. 
*/ -void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) +void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; pxgstrs_finalize(SOLVEstruct->gstrs_comm); + if ( options->RefineInitialized ) { pzgsmv_finalize(SOLVEstruct->gsmv_comm); options->RefineInitialized = NO; @@ -512,19 +787,19 @@ void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); + if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); options->SolveInitialized = NO; } /* zSolveFinalize */ -/*! \brief Check the inf-norm of the error vector +/*! \brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; doublecomplex temp; - int i, j; + int i, j, ii; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; @@ -534,16 +809,83 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx z_sub(&temp, &x_work[i], &xtrue_work[i]); err = SUPERLU_MAX(err, slud_z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i])); +#if 1 + if (err > 1.e-4 && iam == 1) { + ii = i; + printf("(wrong proc %d) wrong index ii %d\n", iam, ii); + PrintDoublecomplex("x_work(ii)", 5, &x[ii]); + PrintDoublecomplex("x_true(ii)", 5, &xtrue_work[ii]); + fflush(stdout); + break; + } +#endif } +#if 0 + printf("\t(%d) loc n %d: err = %e\txnorm = %e\n", iam, n, err, xnorm); + if (iam == 4) { + printf("ii %d\n", ii); + PrintDoublecomplex("x_work", 5, x); + PrintDoublecomplex("x_true", 5, xtrue_work); + } + fflush(stdout); +#endif /* get the golbal max err & xnrom */ temperr = err; 
tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + if ( !iam ) { + printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + fflush(stdout); + } + } +} + +/*! \brief Destroy broadcast and reduction trees used in triangular solve */ +void +zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + zLocalLU_t *Llu = LUstruct->Llu; +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter Destroy_Tree()"); +#endif + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); + } + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); + } + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); + } } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit zDestroy_Tree()"); +#endif } + diff --git a/EXAMPLE/screate_matrix.c b/EXAMPLE/screate_matrix.c new file mode 100644 index 00000000..2fe905bf --- /dev/null +++ b/EXAMPLE/screate_matrix.c @@ -0,0 +1,427 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * SCREATE_MATRIX read the matrix from data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
 + * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * 
+ */ + +int screate_matrix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < 
colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char * postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. 
*/ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < 
colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/screate_matrix3d.c b/EXAMPLE/screate_matrix3d.c new file mode 100644 index 00000000..5cb123d0 --- /dev/null +++ b/EXAMPLE/screate_matrix3d.c @@ -0,0 +1,463 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * SCREATE_MATRIX read the matrix from data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
 + * It also generates the distributed true solution X
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
 + * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
 + * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo3d_t*
 + *       The 3D process mesh.
+ * 
+ */ + +int screate_matrix3d(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer 
the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int screate_matrix_postfix3d(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, 
*rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if (!strcmp(postfix, "rua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + 
for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/screate_matrix_perturbed.c b/EXAMPLE/screate_matrix_perturbed.c new file mode 100644 index 00000000..0f1d6625 --- /dev/null +++ b/EXAMPLE/screate_matrix_perturbed.c @@ -0,0 +1,419 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * December 31, 2016
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * SCREATE_MATRIX_PERTURBED read the matrix from data file in
+ * Harwell-Boeing format, and distribute it to processors in a distributed
 + * compressed row format. It also generates the distributed true solution X
+ * and the right-hand side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format. 
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
 + * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ * 
+ */ + +int screate_matrix_perturbed(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. 
*/ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + relpos = marker[row - fst_row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row - fst_row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + 
/******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char *postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. 
*/ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. 
*/ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + relpos = marker[row - fst_row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row - fst_row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + 
/******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/sp_ienv.c b/EXAMPLE/sp_ienv.c index 195cf8c2..651a1be7 100644 --- a/EXAMPLE/sp_ienv.c +++ b/EXAMPLE/sp_ienv.c @@ -67,8 +67,8 @@ at the top-level directory. #include -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c new file mode 100644 index 00000000..18a5a4e8 --- /dev/null +++ b/EXAMPLE/zcreate_matrix3d.c @@ -0,0 +1,462 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ * 
+ */ +#include +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) doublecomplex**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) doublecomplex**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
+ */ + +int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. 
*/ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = 
(int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; 
/* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter zcreate_matrix_postfix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"cua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for 
rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 16c9c615..4390f0b7 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -1,41 +1,102 @@ -# Sherry; may not need it? +# include the paths for header files include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC) +include_directories(${SuperLU_DIST_BINARY_DIR}/FORTRAN) -# Fortran stuff +set(sources "superlu_c2f_wrap.c") # initialize precision-independent file + +if(enable_double) + list(APPEND sources c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c) +endif() +if(enable_complex16) + list(APPEND sources c2f_zcreate_matrix_x_b.c superlu_c2f_zwrap.c) +endif() + +add_library(superlu_dist_fortran ${sources}) +set(targets superlu_dist_fortran) +get_target_property(superlu_dist_version superlu_dist VERSION) +get_target_property(superlu_dist_soversion superlu_dist SOVERSION) +set_target_properties(superlu_dist_fortran PROPERTIES VERSION ${superlu_dist_version}) +set_target_properties(superlu_dist_fortran PROPERTIES SOVERSION ${superlu_dist_soversion}) +target_link_libraries(superlu_dist_fortran superlu_dist) + +# depends on FPP defs +add_dependencies(superlu_dist_fortran config_f) + +install(TARGETS superlu_dist_fortran +# DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION "${INSTALL_BIN_DIR}" + 
LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" +) + +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN *.mod + ) +install(FILES superlu_dist_config.fh + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + +# Fortran MPI stuff add_definitions(${MPI_Fortran_COMPILE_FLAGS}) include_directories(${MPI_Fortran_INCLUDE_PATH}) link_directories(${MPI_Fortran_LIBRARIES}) -# Libs linked to all of the examples -set(all_link_libs superlu_dist ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +# Libs to be linked with the Fortran codes +set(fortran_link_libs superlu_dist_fortran ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +set(all_link_libs ${fortran_link_libs} superlu_dist) + #message("!!! in Fortran: MPI_Fortran_LINK_FLAGS='${MPI_Fortran_LINK_FLAGS}'") #message("!!! in Fortran: all_link_libs='${all_link_libs}'") -message("!!! in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") +#message("!!! 
in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") if (NOT MSVC) list(APPEND all_link_libs m) endif () -set(F_MOD superlupara.f90 superlu_mod.f90) -if(enable_double) - set(C_DWRAP dcreate_dist_matrix.c superlu_c2f_dwrap.c) - set(F_DEXM ${F_MOD} dhbcode1.f90 f_pddrive.f90 ${C_DWRAP}) +add_library(ftestmod STATIC superlupara.f90 superlu_mod.f90) + +#if(enable_double) +if(FALSE) + set(F_DEXM ${F_MOD} f_pddrive.F90) add_executable(f_pddrive ${F_DEXM}) - target_link_libraries(f_pddrive ${all_link_libs}) + target_link_libraries(f_pddrive ftestmod ${all_link_libs}) + # set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c ${C_DWRAP}) + set(F_DEXM3D f_pddrive3d.F90) + add_executable(f_pddrive3d ${F_DEXM3D}) + target_link_libraries(f_pddrive3d ftestmod ${all_link_libs}) + set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + + set(F_5x5 f_5x5.F90 sp_ienv.c) add_executable(f_5x5 ${F_5x5}) - target_link_libraries(f_5x5 ${all_link_libs}) -# set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE Fortran) + target_link_libraries(f_5x5 ftestmod ${all_link_libs}) set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + endif() -if(enable_complex16) - set(C_ZWRAP zcreate_dist_matrix.c superlu_c2f_zwrap.c) - set(F_ZEXM ${F_MOD} zhbcode1.f90 f_pzdrive.f90 ${C_ZWRAP}) +# if(enable_complex16) +if(FALSE) + set(F_ZEXM f_pzdrive.F90) add_executable(f_pzdrive ${F_ZEXM}) - target_link_libraries(f_pzdrive ${all_link_libs}) + target_link_libraries(f_pzdrive ftestmod ${all_link_libs}) # set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + + set(F_ZEXM3D f_pzdrive3d.F90) + 
add_executable(f_pzdrive3d ${F_ZEXM3D}) + target_link_libraries(f_pzdrive3d ftestmod ${all_link_libs}) + set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + endif() + + +# Format superlu_dist_config.fh from superlu_dist_config.h in C +add_custom_command( + OUTPUT superlu_dist_config.fh + COMMAND sed;'/^\\//;d';<;superlu_dist_config.h;>;superlu_dist_config.fh + COMMAND cp;superlu_dist_config.fh;${SuperLU_DIST_SOURCE_DIR}/FORTRAN/. + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/FORTRAN +) +add_custom_target(config_f DEPENDS superlu_dist_config.fh) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index d275c360..008e3431 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -8,29 +8,38 @@ # ####################################################################### .SUFFIXES: -.SUFFIXES: .f90 .c .o +.SUFFIXES: .f90 .F90 .c .o include ../make.inc #F90FLAGS = $(FFLAGS) -qfree -qsuffix=f=f90 -qflag=w:w F_MOD = superlupara.o superlu_mod.o -C_DWRAP = dcreate_dist_matrix.o superlu_c2f_dwrap.o -C_ZWRAP = zcreate_dist_matrix.o superlu_c2f_zwrap.o +#C_DWRAP = c2f_dcreate_matrix_x_b.o superlu_c2f_dwrap.o +#C_ZWRAP = c2f_zcreate_matrix_x_b.o superlu_c2f_zwrap.o -F_DEXM = $(F_MOD) dhbcode1.o f_pddrive.o -F_ZEXM = $(F_MOD) zhbcode1.o f_pzdrive.o -F_5x5 = $(F_MOD) f_5x5.o sp_ienv.o +F_DEXM = $(F_MOD) f_pddrive.o +F_DEXM3D= $(F_MOD) f_pddrive3d.o +F_ZEXM = $(F_MOD) f_pzdrive.o +F_ZEXM3D= $(F_MOD) f_pzdrive3d.o +F_5x5 = $(F_MOD) f_5x5.o sp_ienv.o -all: f_pddrive f_pzdrive f_5x5 -f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_5x5) $(C_DWRAP) $(LIBS) -o $@ +all: f_pddrive f_pddrive3d f_pzdrive f_pzdrive3d f_5x5 -f_pddrive: $(F_DEXM) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_DEXM) $(C_DWRAP) $(LIBS) -o $@ +f_5x5: $(F_5x5) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_5x5) $(LIBS) -o $@ -f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_ZEXM) $(C_ZWRAP) $(LIBS) -o $@ 
+f_pddrive: $(F_DEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM) $(LIBS) -o $@ + +f_pddrive3d: $(F_DEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM3D) $(LIBS) -o $@ + +f_pzdrive: $(F_ZEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM) $(LIBS) -o $@ + +f_pzdrive3d: $(F_ZEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM3D) $(LIBS) -o $@ .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) @@ -38,6 +47,9 @@ f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB) .f90.o: $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) +.F90.o: + $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) + .f.o: $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) diff --git a/FORTRAN/README b/FORTRAN/README index 95187ab8..ce7da07d 100644 --- a/FORTRAN/README +++ b/FORTRAN/README @@ -1,4 +1,5 @@ Fortran 90 Interface + ==================== This directory contains Fortran-90 wrapper routines for SuperLU_DIST. The directory contains the following files: @@ -13,7 +14,7 @@ To compile the code, type 'make' There are two examples in the directory. -1. f_5x5.f90: +1. f_5x5.f90 A small 5x5 example appeared in the SuperLU Users Guide, Section 2.2. To run the code, type: mpiexec -n 2 f_5x5 @@ -24,11 +25,25 @@ There are two examples in the directory. 'g20.rua' in Harwell-Boeing format. To run the code, type: mpiexec -n 4 f_pddrive - (The example is set up to use 4 processors.) + (The example is set up to use 4 MPI processes) + +2. f_pddrive3d.f90: use the 3D algorithms + A real example Fortran driver routine that reads a matrix from a file + 'g20.rua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pddrive3d + (The example is set up to use 8 MPI processes) 3. f_pzdrive.f90 A complex example Fortran driver routine that reads a matrix from a file 'cg20.cua' in Harwell-Boeing format. To run the code, type: mpiexec -n 4 f_pzdrive - (The example is set up to use 4 processors.) + (The example is set up to use 4 MPI processes) + +3. 
f_pzdrive3d.f90: use the 3D algorihms + A complex example Fortran driver routine that reads a matrix from a file + 'cg20.cua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pzdrive3d + (The example is set up to use 8 MPI processes) diff --git a/FORTRAN/c2f_dcreate_matrix_x_b.c b/FORTRAN/c2f_dcreate_matrix_x_b.c new file mode 100644 index 00000000..cd9b970f --- /dev/null +++ b/FORTRAN/c2f_dcreate_matrix_x_b.c @@ -0,0 +1,281 @@ + + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ * 
+ */ +#include +#include "superlu_ddefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * C2F_DCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * fname (input) char*
+ *       File name as a character string.
+ *
+ * nrhs  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in global matrix
+ *
+ * rhs   (output) double*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) double*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * 
+ */ + +int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + double *rhs, int *ldb, double *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc, m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_dreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. 
*/ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + x[i + j*(*ldx)] = 
xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_dreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/c2f_zcreate_matrix_x_b.c b/FORTRAN/c2f_zcreate_matrix_x_b.c new file mode 100644 index 00000000..df54a458 --- /dev/null +++ b/FORTRAN/c2f_zcreate_matrix_x_b.c @@ -0,0 +1,280 @@ + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ * 
+ */ +#include +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ * 
+ * C2F_ZCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distribute it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments   
+ * =========      
+ *
+ * fname (input) char*
+ *       File name as a character string.
+ *
+ * nrhs  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in global matrix
+ *
+ * rhs   (output) doublecomplex*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) doublecomplex*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * 
+ */ + +int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + doublecomplex *rhs, int *ldb, doublecomplex *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc, m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_zreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"cua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). 
*/ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; 
++i) + x[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_zreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/dcreate_dist_matrix.c b/FORTRAN/dcreate_dist_matrix.c index 3a6dde99..bf451cb9 100644 --- a/FORTRAN/dcreate_dist_matrix.c +++ b/FORTRAN/dcreate_dist_matrix.c @@ -118,6 +118,9 @@ int dcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz, MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); } + if (iam==0) {printf("after broadcast: m %d, nnz %d\n", m,nnz); fflush(stdout);} + exit(-1); + #if 0 nzval[0]=0.1; #endif diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.F90 similarity index 85% rename from FORTRAN/f_5x5.f90 rename to FORTRAN/f_5x5.F90 index fec77adc..7585da3a 100644 --- a/FORTRAN/f_5x5.f90 +++ b/FORTRAN/f_5x5.F90 @@ -33,15 +33,19 @@ program f_5x5 ! 6. Release the process grid and terminate the MPI environment ! 7. Release all structures ! +#include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 ) +#if (XSDK_INDEX_SIZE==64) + integer*8 colind(maxnz), rowptr(maxn+1) +#else integer colind(maxnz), rowptr(maxn+1) +#endif real*8 nzval(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nrhs, ldb, nprow, npcol, init - integer*4 iam, info, i, ierr, ldb4 + integer n, m, nnz, nrhs, nprow, npcol, init + integer iam, info, i, ierr, ldb integer nnz_loc, m_loc, fst_row real*8 s, u, p, e, r, l @@ -62,9 +66,9 @@ program f_5x5 ! 
Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -81,6 +85,9 @@ program f_5x5 if ( iam == 0 ) then write(*,*) ' Process grid ', nprow, ' X ', npcol write(*,*) ' default integer size ', kind(0) +#if (XSDK_INDEX_SIZE==64) + write(*,*) ' use 64-bit integer for A matrix' +#endif endif ! !************************************************************************* @@ -166,7 +173,6 @@ program f_5x5 b(i) = 1.0 enddo nrhs = 1 - ldb4 = ldb ! Set the default input options call f_set_default_options(options) @@ -177,14 +183,14 @@ program f_5x5 ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A,nrow=m,ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0 .and. iam == 1) then @@ -196,13 +202,13 @@ program f_5x5 ! 
Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_SuperMat_Store_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_dSolveFinalize(options, SOLVEstruct) - endif + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) +! call f_LUstructFree(LUstruct) +! call get_superlu_options(options, SolveInitialized=init) +! if (init == YES) then +! call f_dSolveFinalize(options, SOLVEstruct) +! endif ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) @@ -213,7 +219,11 @@ program f_5x5 call f_destroy_ScalePerm_handle(ScalePermstruct) call f_destroy_LUstruct_handle(LUstruct) call f_destroy_SOLVEstruct_handle(SOLVEstruct) + +! call f_Destroy_CompRowLoc_Mat_dist(A) +! need to free the supermatrix Store call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) ! Check malloc diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.F90 similarity index 67% rename from FORTRAN/f_pddrive.f90 rename to FORTRAN/f_pddrive.F90 index 33803d99..c69792af 100644 --- a/FORTRAN/f_pddrive.f90 +++ b/FORTRAN/f_pddrive.F90 @@ -29,15 +29,20 @@ program f_pddrive ! 7. Release all structures ! ! +#include "superlu_dist_config.fh" use superlu_mod -! 
implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) - real*8 values(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nprow, npcol, ldb, init - integer*4 iam, info, i, ierr, ldb4, nrhs + real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname integer(superlu_ptr) :: grid integer(superlu_ptr) :: options @@ -56,9 +61,9 @@ program f_pddrive ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -76,32 +81,16 @@ program f_pddrive write(*,*) ' Process grid ', nprow, ' X ', npcol endif -! Read Harwell-Boeing matrix, and adjust the pointers and indices -! to 0-based indexing, as required by C routines. +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_dcreate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + if ( iam == 0 ) then - open(file = "../EXAMPLE/g20.rua", status = "old", unit = 5) - call dhbcode1(m, n, nnz, values, rowind, colptr) - close(unit = 5) -! - do i = 1, n+1 - colptr(i) = colptr(i) - 1 - enddo - do i = 1, nnz - rowind(i) = rowind(i) - 1 - enddo + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz endif -! Distribute the matrix to the process gird - call f_dcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid) - -! 
Setup the right hand side - call get_CompRowLoc_Matrix(A, nrow_loc=ldb) - do i = 1, ldb - b(i) = 1.0 - enddo - nrhs = 1 - ldb4 = ldb - ! Set the default input options call f_set_default_options(options) @@ -111,18 +100,20 @@ program f_pddrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0) then - write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif else write(*,*) 'INFO from f_pdgssvx = ', info endif @@ -130,13 +121,8 @@ program f_pddrive ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_dSolveFinalize(options, SOLVEstruct) - endif + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pddrive3d.F90 b/FORTRAN/f_pddrive3d.F90 new file mode 100644 index 00000000..b29eeb7d --- /dev/null +++ b/FORTRAN/f_pddrive3d.F90 @@ -0,0 +1,163 @@ + + +!> @file +! Copyright (c) 2003, The Regents of the University of California, through +! Lawrence Berkeley National Laboratory (subject to receipt of any required +! approvals from U.S. Dept. of Energy) +! +! All rights reserved. +! +! 
The source code is distributed under BSD license, see the file License.txt +! at the top-level directory. +! +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! May 12, 2021
+!! 
+! + program f_pddrive3d +! +! Purpose +! ======= +! +! The driver program F_PDDRIVE3D. +! +! This example illustrates how to use F_PDGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pdgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pddrive3d +! +#include "superlu_dist_config.fh" + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! Bail out if I do not belong in the grid. 
+ call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_dcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pdgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + + call f_dDestroy_A3d_gathered_on_2d(SOLVEstruct, grid) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! 
Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_psdrive.F90 b/FORTRAN/f_psdrive.F90 new file mode 100644 index 00000000..e7014834 --- /dev/null +++ b/FORTRAN/f_psdrive.F90 @@ -0,0 +1,146 @@ + + +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!! 
+! + program f_psdrive +! +! Purpose +! ======= +! +! The driver program F_PDDRIVE. +! +! This example illustrates how to use F_PDGSSVX with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pdgssvx +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! + #include "superlu_dist_config.fh" + use superlu_mod + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo_handle(grid) + call f_create_options_handle(options) + call f_screate_ScalePerm_handle(ScalePermstruct) + call f_screate_LUstruct_handle(LUstruct) + call f_screate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid) + +! Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam) + if ( iam >= nprow * npcol ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid ', nprow, ' X ', npcol + endif + +! 
Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_screate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_sScalePermstructInit(m, n, ScalePermstruct) + call f_sLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_psgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_sScalePermstructFree(ScalePermstruct) + call f_sDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! 
Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.F90 similarity index 67% rename from FORTRAN/f_pzdrive.f90 rename to FORTRAN/f_pzdrive.F90 index 9c9db5b0..f493f1d5 100644 --- a/FORTRAN/f_pzdrive.f90 +++ b/FORTRAN/f_pzdrive.F90 @@ -28,15 +28,21 @@ program f_pzdrive ! 7. Release all structures ! ! +#include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) - double complex values(maxnz), b(maxn), berr(maxnrhs) - integer n, m, nnz, nprow, npcol, ldb, init - integer*4 iam, info, i, ierr, ldb4, nrhs + double complex values(maxnz), b(maxn), xtrue(maxn) + real*8 berr(maxnrhs) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname integer(superlu_ptr) :: grid integer(superlu_ptr) :: options @@ -55,9 +61,9 @@ program f_pzdrive ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -75,32 +81,16 @@ program f_pzdrive write(*,*) ' Process grid ', nprow, ' X ', npcol endif -! Read Harwell-Boeing matrix, and adjust the pointers and indices -! to 0-based indexing, as required by C routines. +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/cg20.cua'//char(0) !! 
make the string null-ended + call f_zcreate_matrix_x_b(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + if ( iam == 0 ) then - open(file = "../EXAMPLE/cg20.cua", status = "old", unit = 5) - call zhbcode1(m, n, nnz, values, rowind, colptr) - close(unit = 5) -! - do i = 1, n+1 - colptr(i) = colptr(i) - 1 - enddo - do i = 1, nnz - rowind(i) = rowind(i) - 1 - enddo + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz endif -! Distribute the matrix to the process gird - call f_zcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid) - -! Setup the right hand side - call get_CompRowLoc_Matrix(A, nrow_loc=ldb) - do i = 1, ldb - b(i) = 1.0 - enddo - nrhs = 1 - ldb4 = ldb - ! Set the default input options call f_set_default_options(options) @@ -110,18 +100,20 @@ program f_pzdrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) ! Call the linear equation solver - call f_pzgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, & + call f_pzgssvx(options, A, ScalePermstruct, b, ldb, nrhs, & grid, LUstruct, SOLVEstruct, berr, stat, info) if (info == 0) then - write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif else write(*,*) 'INFO from f_pdgssvx = ', info endif @@ -129,13 +121,8 @@ program f_pzdrive ! 
Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU(n, grid, LUstruct) - call f_LUstructFree(LUstruct) - call get_superlu_options(options, SolveInitialized=init) - if (init == YES) then - call f_zSolveFinalize(options, SOLVEstruct) - endif + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pzdrive3d.F90 b/FORTRAN/f_pzdrive3d.F90 new file mode 100644 index 00000000..a299d7b3 --- /dev/null +++ b/FORTRAN/f_pzdrive3d.F90 @@ -0,0 +1,163 @@ + +!> @file +! Copyright (c) 2003, The Regents of the University of California, through +! Lawrence Berkeley National Laboratory (subject to receipt of any required +! approvals from U.S. Dept. of Energy) +! +! All rights reserved. +! +! The source code is distributed under BSD license, see the file License.txt +! at the top-level directory. +! +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! May 12, 2021
+!! 
+! + program f_pzdrive3d +! +! Purpose +! ======= +! +! The driver program F_PZDRIVE3D. +! +! This example illustrates how to use F_PZGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pzgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pzdrive3d +! +#include "superlu_dist_config.fh" + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + double complex values(maxnz), b(maxn), xtrue(maxn) + real*8 berr(maxnrhs) +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! 
Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/cg20.cua'//char(0) !! make the string null-ended + call f_zcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pzgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + + call f_zDestroy_A3d_gathered_on_2d(SOLVEstruct, grid) + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! 
Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c index 3366671a..e62d5c91 100644 --- a/FORTRAN/sp_ienv.c +++ b/FORTRAN/sp_ienv.c @@ -53,6 +53,8 @@ at the top-level directory. of L and U, compared with A; = 7: the minimum value of the product M*N*K for a GEMM call to be off-loaded to accelerator (e.g., GPU, Xeon Phi). + = 8: the maximum buffer size on GPU that can hold the three + matrices in the GEMM call for the Schur complement update. (SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC @@ -62,13 +64,11 @@ at the top-level directory.
*/ - #include #include - -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; @@ -94,21 +94,27 @@ sp_ienv_dist(int_t ispec) return 1; case 3: - ttemp = getenv("NSUP"); + ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h if(ttemp) { - return(atoi(ttemp)); + int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE ); + return (k); } - else - return 128; + else return 128; #endif - case 6: return (5); + case 6: + ttemp = getenv("FILL"); + if ( ttemp ) return(atoi(ttemp)); + else return (5); case 7: ttemp = getenv ("N_GEMM"); if (ttemp) return atoi (ttemp); else return 10000; - + case 8: + ttemp = getenv ("MAX_BUFFER_SIZE"); + if (ttemp) return atoi (ttemp); + else return 64000000; // 8000^2 } /* Invalid value for ISPEC */ @@ -116,6 +122,5 @@ sp_ienv_dist(int_t ispec) xerr_dist("sp_ienv", &i); return 0; - } /* sp_ienv_dist */ diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c index da0e5de7..fb5122e8 100644 --- a/FORTRAN/superlu_c2f_dwrap.c +++ b/FORTRAN/superlu_c2f_dwrap.c @@ -4,287 +4,133 @@ * \brief C interface functions for the Fortran90 wrapper. * *
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
 #include "superlu_FCnames.h"
 
-/* kind of integer to hold a pointer.  Use int.
-   This might need to be changed on systems with large memory.
-   If changed, be sure to change it in superlupara.f90 too */
 
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
 #if 0
 typedef int fptr;  /* 32-bit */
 #else
 typedef long long int fptr;  /* 64-bit */
 #endif
 
-
-/* some MPI implementations may require conversion between a Fortran
-   communicator and a C communicator.  This routine is used to perform the
-   conversion.  It may need different forms for different MPI libraries. */
-
-/* NO_MPI2 should be defined on the compiler command line if the MPI
-   library does not provide MPI_Comm_f2c */
-
-MPI_Comm f2c_comm(int *f_comm)
-{
-#ifndef NO_MPI2
-
-/* MPI 2 provides a standard way of doing this */
-   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
-#else
-
-/* will probably need some special cases here */
-/* when in doubt, just return the input */
-   return (MPI_Comm)(*f_comm);
-#endif
-}
-
-
 /* functions that create memory for a struct and return a handle */
 
-void f_create_gridinfo_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
-}
-
-void f_create_options_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
-}
-
-void f_create_ScalePerm_handle(fptr *handle)
+void f_dcreate_ScalePerm_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dScalePermstruct_t));
 }
 
-void f_create_LUstruct_handle(fptr *handle)
+void f_dcreate_LUstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dLUstruct_t));
 }
 
-void f_create_SOLVEstruct_handle(fptr *handle)
+void f_dcreate_SOLVEstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(dSOLVEstruct_t));
 }
 
-void f_create_SuperMatrix_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
-}
-
-void f_create_SuperLUStat_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
-}
-
-/* functions that free the memory allocated by the above functions */
-
-void f_destroy_gridinfo_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_options_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_ScalePerm_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_LUstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SOLVEstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperMatrix_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperLUStat_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-/* functions that get or set values in a C struct.
-   This is not the complete set of structs for which a user might want
-   to get/set a component, and there may be missing components. */
-
-void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
-{
-  *iam=((gridinfo_t *) *grid)->iam;
-  *npcol=((gridinfo_t *) *grid)->npcol;
-  *nprow=((gridinfo_t *) *grid)->nprow;
-}
-
-void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   *nrow = ((SuperMatrix *) *A)->nrow;
-   *ncol = ((SuperMatrix *) *A)->ncol;
-}
-
-void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   ((SuperMatrix *) *A)->nrow = *nrow;
-   ((SuperMatrix *) *A)->ncol = *ncol;
-}
-
-void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  *m=((SuperMatrix *) *A)->nrow;
-  *n=((SuperMatrix *) *A)->ncol;
-  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
-  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
-  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
-}
-
-void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  ((SuperMatrix *) *A)->nrow = *m;
-  ((SuperMatrix *) *A)->ncol = *n;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row;
-}
-
-void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
-   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
-   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
-   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
-   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
-   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
-   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
-   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
-   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
-   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
-   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
-}
-
-void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
-    l_options->Fact = (fact_t) *Fact;
-   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
-   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
-   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
-   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
-   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
-   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
-   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
-   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
-   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
-   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
-}
-
 /* wrappers for SuperLU functions */
 
-void f_set_default_options(fptr *options)
-{
-   set_default_options_dist((superlu_dist_options_t *) *options);
-}
-
-void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
-{
-  
-   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
-{
-   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridexit(fptr *grid)
-{
-   superlu_gridexit((gridinfo_t *) *grid);
-}
-
-void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+void f_dScalePermstructInit(int *m, int *n, fptr *ScalePermstruct)
 {
    dScalePermstructInit(*m, *n, (dScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_ScalePermstructFree(fptr *ScalePermstruct)
+void f_dScalePermstructFree(fptr *ScalePermstruct)
 {
    dScalePermstructFree((dScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_PStatInit(fptr *stat)
-{
-   PStatInit((SuperLUStat_t *) *stat);
-}
-
-void f_PStatFree(fptr *stat)
-{
-   PStatFree((SuperLUStat_t *) *stat);
-}
-
-void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+void f_dLUstructInit(int *m, int *n, fptr *LUstruct)
 {
    extern void dLUstructInit(const int_t, dLUstruct_t *);
 
    dLUstructInit(*m, (dLUstruct_t *) *LUstruct);
 }
 
-void f_LUstructFree(fptr *LUstruct)
+void f_dLUstructFree(fptr *LUstruct)
 {
    extern void dLUstructFree(dLUstruct_t *);
 
    dLUstructFree((dLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_dDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    dLUstructFree((dLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+    }
 }
 
-void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-				   int_t *m_loc, int_t *fst_row, double *nzval,
-				   int_t *colind, int_t *rowptr, int *stype,
-				   int *dtype, int *mtype)
+void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
 {
-   dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
-                                  *fst_row, (double *) nzval, colind, rowptr,
-                                  (Stype_t) *stype, (Dtype_t) *dtype,
-                                  (Mtype_t) *mtype);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dLUstruct_t *LUstruct_ptr = (dLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	dDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+    	dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+    } else { // process layers other than 0
+        dDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        dDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    dLUstructFree(LUstruct_ptr);
 }
 
-void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+void f_dDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
-   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+    dDestroy_A3d_gathered_on_2d((dSOLVEstruct_t *) *SOLVEstruct,
+                                      (gridinfo3d_t *) *grid3d);
 }
 
-void f_Destroy_SuperMat_Store_dist(fptr *A)
+
+void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
+				   int *m_loc, int *fst_row, double *nzval,
+				   int_t *colind, int_t *rowptr, int *stype,
+				   int *dtype, int *mtype)
 {
-   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+#if 1
+    double *C_nzval = nzval;
+    int_t *C_colind = colind;
+    int_t *C_rowptr = rowptr;
+#else
+    /* make a copy of matrix A that is internal to the C side */
+    double *C_nzval = doubleMalloc_dist(*nnz_loc);
+    int_t *C_colind = intMalloc_dist(*nnz_loc);
+    int_t *C_rowptr = intMalloc_dist(*m_loc + 1);
+    int i;
+    
+    for (i = 0; i < *nnz_loc; ++i) {
+        C_nzval[i] = nzval[i];
+        C_colind[i] = colind[i];
+    }
+    for (i = 0; i <= *m_loc; ++i) {
+        C_rowptr[i] = rowptr[i];
+    }
+#endif
+
+    dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
+                                  *fst_row, C_nzval, C_colind, C_rowptr,
+                                  (Stype_t) *stype, (Dtype_t) *dtype,
+                                  (Mtype_t) *mtype);
 }
 
 void f_dSolveFinalize(fptr *options, fptr *SOLVEstruct)
@@ -307,26 +153,64 @@ void f_pdgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, double *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_dcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   double *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pdgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 double *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int dcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, double *,
-			   int_t * , int_t *, gridinfo_t *);
-
-   dcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (double *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pdgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (dScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (dLUstruct_t *) *LUstruct,
+	      (dSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
 
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
 }
 
-/* Check malloc */
+/* Create the distributed matrix */
 
-void f_check_malloc(int *iam)
-{
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC((int_t) *iam, "Check Malloc");
-#endif
+void f_dcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+    extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol);
+
+    int iam, nprocs;
+    int nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_dcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+    extern void f_get_gridinfo3d(fptr *grid, int *iam,
+                                 int *nprow, int *npcol, int *npdep);
+
+    int iam, nprocs;
+    int nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
+
diff --git a/FORTRAN/superlu_c2f_wrap.c b/FORTRAN/superlu_c2f_wrap.c
new file mode 100644
index 00000000..1c4b0e8e
--- /dev/null
+++ b/FORTRAN/superlu_c2f_wrap.c
@@ -0,0 +1,262 @@
+
+
+/*! @file 
+ * \brief C interface functions for the Fortran90 wrapper.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ * April 5, 2015
+ * May 12, 2021
+ */
+
+#include "superlu_defs.h"
+#include "superlu_FCnames.h"
+
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
+#if 0
+typedef int fptr;  /* 32-bit */
+#else
+typedef long long int fptr;  /* 64-bit */
+#endif
+
+
+/* some MPI implementations may require conversion between a Fortran
+   communicator and a C communicator.  This routine is used to perform the
+   conversion.  It may need different forms for different MPI libraries. */
+
+/* NO_MPI2 should be defined on the compiler command line if the MPI
+   library does not provide MPI_Comm_f2c */
+
+MPI_Comm f2c_comm(int *f_comm)
+{
+#ifndef NO_MPI2
+
+/* MPI 2 provides a standard way of doing this */
+   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
+#else
+
+/* will probably need some special cases here */
+/* when in doubt, just return the input */
+   return (MPI_Comm)(*f_comm);
+#endif
+}
+
+
+/* functions that create memory for a struct and return a handle */
+
+void f_create_gridinfo_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
+}
+
+void f_create_gridinfo3d_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t));
+}
+
+void f_create_options_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
+}
+
+void f_create_SuperMatrix_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
+}
+
+void f_create_SuperLUStat_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
+}
+
+/* functions that free the memory allocated by the above functions */
+
+void f_destroy_gridinfo_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_options_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_ScalePerm_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_LUstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SOLVEstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperMatrix_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperLUStat_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+/* functions that get or set values in a C struct.
+   This is not the complete set of structs for which a user might want
+   to get/set a component, and there may be missing components. */
+
+void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol)
+{
+  *iam=((gridinfo_t *) *grid)->iam;
+  *npcol=((gridinfo_t *) *grid)->npcol;
+  *nprow=((gridinfo_t *) *grid)->nprow;
+}
+
+void f_get_gridinfo3d(fptr *grid, int *iam,
+         	      int *nprow, int *npcol, int *npdep)
+{
+  *iam=((gridinfo3d_t *) *grid)->iam;
+  *npcol=((gridinfo3d_t *) *grid)->npcol;
+  *nprow=((gridinfo3d_t *) *grid)->nprow;
+  *npdep=((gridinfo3d_t *) *grid)->npdep;
+}
+
+void f_get_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   *nrow = ((SuperMatrix *) *A)->nrow;
+   *ncol = ((SuperMatrix *) *A)->ncol;
+}
+
+void f_set_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   ((SuperMatrix *) *A)->nrow = *nrow;
+   ((SuperMatrix *) *A)->ncol = *ncol;
+}
+
+void f_get_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  *m=((SuperMatrix *) *A)->nrow;
+  *n=((SuperMatrix *) *A)->ncol;
+  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
+  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
+  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
+}
+
+void f_set_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  NRformat_loc *Astore = ((SuperMatrix *) *A)->Store;
+
+  ((SuperMatrix *) *A)->nrow = *m;
+  ((SuperMatrix *) *A)->ncol = *n;
+  Astore->m_loc = *m_loc;
+  Astore->nnz_loc = *nnz_loc;
+  Astore->fst_row = *fst_row;
+}
+
+void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
+   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
+   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
+   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
+   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
+   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
+   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
+   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
+   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
+   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
+   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
+}
+
+void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
+    l_options->Fact = (fact_t) *Fact;
+   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
+   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
+   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
+   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
+   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
+   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
+   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
+   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
+   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
+   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
+}
+
+/* wrappers for SuperLU functions */
+
+void f_set_default_options(fptr *options)
+{
+   set_default_options_dist((superlu_dist_options_t *) *options);
+}
+
+void f_superlu_gridinit(int *Bcomm, int *nprow, int *npcol, fptr *grid)
+{
+   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridinit3d(int *Bcomm, int *nprow, int *npcol,
+   			  int *npdep, fptr *grid)
+{
+    superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid);
+}
+
+void f_superlu_gridmap(int *Bcomm, int *nprow, int *npcol, 
+                       int *usermap, int *ldumap, fptr *grid)
+{
+   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap,
+		   (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridexit(fptr *grid)
+{
+   superlu_gridexit((gridinfo_t *) *grid);
+}
+
+void f_PStatInit(fptr *stat)
+{
+   PStatInit((SuperLUStat_t *) *stat);
+}
+
+void f_PStatFree(fptr *stat)
+{
+   PStatFree((SuperLUStat_t *) *stat);
+}
+
+void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+{
+   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+}
+
+void f_Destroy_SuperMat_Store_dist(fptr *A)
+{
+   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+}
+
+/* Check malloc */
+
+void f_check_malloc(int *iam)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(*iam, "Check Malloc");
+#endif
+}
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
index ee963113..70f66607 100644
--- a/FORTRAN/superlu_c2f_zwrap.c
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -3,287 +3,133 @@
  * \brief C interface functions for the Fortran90 wrapper.
  *
  * 
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
 #include "superlu_FCnames.h"
 
-/* kind of integer to hold a pointer.  Use int.
-   This might need to be changed on systems with large memory.
-   If changed, be sure to change it in superlupara.f90 too */
 
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
 #if 0
 typedef int fptr;  /* 32-bit */
 #else
 typedef long long int fptr;  /* 64-bit */
 #endif
 
-
-/* some MPI implementations may require conversion between a Fortran
-   communicator and a C communicator.  This routine is used to perform the
-   conversion.  It may need different forms for different MPI libraries. */
-
-/* NO_MPI2 should be defined on the compiler command line if the MPI
-   library does not provide MPI_Comm_f2c */
-
-MPI_Comm f2c_comm(int *f_comm)
-{
-#ifndef NO_MPI2
-
-/* MPI 2 provides a standard way of doing this */
-   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
-#else
-
-/* will probably need some special cases here */
-/* when in doubt, just return the input */
-   return (MPI_Comm)(*f_comm);
-#endif
-}
-
-
 /* functions that create memory for a struct and return a handle */
 
-void f_create_gridinfo_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
-}
-
-void f_create_options_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
-}
-
-void f_create_ScalePerm_handle(fptr *handle)
+void f_zcreate_ScalePerm_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zScalePermstruct_t));
 }
 
-void f_create_LUstruct_handle(fptr *handle)
+void f_zcreate_LUstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zLUstruct_t));
 }
 
-void f_create_SOLVEstruct_handle(fptr *handle)
+void f_zcreate_SOLVEstruct_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(zSOLVEstruct_t));
 }
 
-void f_create_SuperMatrix_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
-}
-
-void f_create_SuperLUStat_handle(fptr *handle)
-{
-   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
-}
-
-/* functions that free the memory allocated by the above functions */
-
-void f_destroy_gridinfo_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_options_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_ScalePerm_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_LUstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SOLVEstruct_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperMatrix_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-void f_destroy_SuperLUStat_handle(fptr *handle)
-{
-   SUPERLU_FREE((void *)*handle);
-}
-
-/* functions that get or set values in a C struct.
-   This is not the complete set of structs for which a user might want
-   to get/set a component, and there may be missing components. */
-
-void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
-{
-  *iam=((gridinfo_t *) *grid)->iam;
-  *npcol=((gridinfo_t *) *grid)->npcol;
-  *nprow=((gridinfo_t *) *grid)->nprow;
-}
-
-void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   *nrow = ((SuperMatrix *) *A)->nrow;
-   *ncol = ((SuperMatrix *) *A)->ncol;
-}
-
-void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
-{
-   ((SuperMatrix *) *A)->nrow = *nrow;
-   ((SuperMatrix *) *A)->ncol = *ncol;
-}
-
-void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  *m=((SuperMatrix *) *A)->nrow;
-  *n=((SuperMatrix *) *A)->ncol;
-  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
-  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
-  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
-}
-
-void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-			     int_t *m_loc, int_t *fst_row)
-{
-  ((SuperMatrix *) *A)->nrow = *m;
-  ((SuperMatrix *) *A)->ncol = *n;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc;
-  ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row;
-}
-
-void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
-   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
-   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
-   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
-   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
-   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
-   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
-   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
-   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
-   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
-   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
-}
-
-void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
-                           int *ColPerm, int *RowPerm, int *IterRefine,
-			   int *Trans, int *ReplaceTinyPivot,
-			   int *SolveInitialized, int *RefineInitialized,
-			   int *PrintStat)
-{
-    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
-    l_options->Fact = (fact_t) *Fact;
-   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
-   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
-   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
-   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
-   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
-   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
-   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
-   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
-   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
-   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
-}
-
 /* wrappers for SuperLU functions */
 
-void f_set_default_options(fptr *options)
-{
-   set_default_options_dist((superlu_dist_options_t *) *options);
-}
-
-void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
-{
-  
-   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
-{
-   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
-}
-
-void f_superlu_gridexit(fptr *grid)
-{
-   superlu_gridexit((gridinfo_t *) *grid);
-}
-
-void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct)
+void f_zScalePermstructInit(int *m, int *n, fptr *ScalePermstruct)
 {
    zScalePermstructInit(*m, *n, (zScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_ScalePermstructFree(fptr *ScalePermstruct)
+void f_zScalePermstructFree(fptr *ScalePermstruct)
 {
    zScalePermstructFree((zScalePermstruct_t *) *ScalePermstruct);
 }
 
-void f_PStatInit(fptr *stat)
-{
-   PStatInit((SuperLUStat_t *) *stat);
-}
-
-void f_PStatFree(fptr *stat)
-{
-   PStatFree((SuperLUStat_t *) *stat);
-}
-
-void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct)
+void f_zLUstructInit(int *m, int *n, fptr *LUstruct)
 {
    extern void zLUstructInit(const int_t, zLUstruct_t *);
 
    zLUstructInit(*m, (zLUstruct_t *) *LUstruct);
 }
 
-void f_LUstructFree(fptr *LUstruct)
+void f_zLUstructFree(fptr *LUstruct)
 {
    extern void zLUstructFree(zLUstruct_t *);
 
    zLUstructFree((zLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_zDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    zLUstructFree((zLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+    }
 }
 
-void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
-				   int_t *m_loc, int_t *fst_row, doublecomplex *nzval,
-				   int_t *colind, int_t *rowptr, int *stype,
-				   int *dtype, int *mtype)
+void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
 {
-   zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
-                                  *fst_row, (doublecomplex *) nzval, colind, rowptr,
-                                  (Stype_t) *stype, (Dtype_t) *dtype,
-                                  (Mtype_t) *mtype);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zLUstruct_t *LUstruct_ptr = (zLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	zDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+    	zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+    } else { // process layers other than 0
+        zDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        zDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    zLUstructFree(LUstruct_ptr);
 }
 
-void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+void f_zDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
-   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+    zDestroy_A3d_gathered_on_2d((zSOLVEstruct_t *) *SOLVEstruct,
+                                      (gridinfo3d_t *) *grid3d);
 }
 
-void f_Destroy_SuperMat_Store_dist(fptr *A)
+
+void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
+				   int *m_loc, int *fst_row, doublecomplex *nzval,
+				   int_t *colind, int_t *rowptr, int *stype,
+				   int *dtype, int *mtype)
 {
-   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+#if 1
+    doublecomplex *C_nzval = nzval;
+    int_t *C_colind = colind;
+    int_t *C_rowptr = rowptr;
+#else
+    /* make a copy of matrix A that is internal to the C side */
+    doublecomplex *C_nzval = doublecomplexMalloc_dist(*nnz_loc);
+    int_t *C_colind = intMalloc_dist(*nnz_loc);
+    int_t *C_rowptr = intMalloc_dist(*m_loc + 1);
+    int i;
+    
+    for (i = 0; i < *nnz_loc; ++i) {
+        C_nzval[i] = nzval[i];
+        C_colind[i] = colind[i];
+    }
+    for (i = 0; i <= *m_loc; ++i) {
+        C_rowptr[i] = rowptr[i];
+    }
+#endif
+
+    zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc,
+                                  *fst_row, C_nzval, C_colind, C_rowptr,
+                                  (Stype_t) *stype, (Dtype_t) *dtype,
+                                  (Mtype_t) *mtype);
 }
 
 void f_zSolveFinalize(fptr *options, fptr *SOLVEstruct)
@@ -306,26 +152,64 @@ void f_pzgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, doublecomplex *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_zcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   doublecomplex *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pzgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 doublecomplex *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int zcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
-			   int_t * , int_t *, gridinfo_t *);
-
-   zcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (doublecomplex *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pzgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (zScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (zLUstruct_t *) *LUstruct,
+	      (zSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
 
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
 }
 
-/* Check malloc */
+/* Create the distributed matrix */
 
-void f_check_malloc(int *iam)
-{
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC((int_t) *iam, "Check Malloc");
-#endif
+void f_zcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+    extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol);
+
+    int iam, nprocs;
+    int nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_zcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+    extern void f_get_gridinfo3d(fptr *grid, int *iam,
+                                 int *nprow, int *npcol, int *npdep);
+
+    int iam, nprocs;
+    int nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
+
diff --git a/FORTRAN/superlu_mod.f90 b/FORTRAN/superlu_mod.f90
index bdfa8191..4a70a8f4 100644
--- a/FORTRAN/superlu_mod.f90
+++ b/FORTRAN/superlu_mod.f90
@@ -1,7 +1,12 @@
 !> @file
 !! \brief This module contains Fortran-side wrappers for the SuperLU
 !! get/set functions.
-!
+!!
+!! 
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
module superlu_mod @@ -16,13 +21,18 @@ module superlu_mod implicit none contains -subroutine get_GridInfo(grid, iam, nprow, npcol) - integer(superlu_ptr) :: grid +subroutine get_GridInfo(grid, iam, nprow, npcol, npdep) + integer(superlu_ptr) :: grid !! can be 2D or 3D grid integer*4, optional :: iam - integer, optional :: nprow, npcol - integer :: l_iam, l_nprow, l_npcol - - call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol) + integer, optional :: nprow, npcol, npdep + integer :: l_iam, l_nprow, l_npcol, l_npdep + + if (present(npdep)) then + call f_get_gridinfo3d(grid, l_iam, l_nprow, l_npcol, l_npdep) + npdep = l_npdep + else + call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol) + endif if (present(iam)) iam = l_iam if (present(nprow)) nprow = l_nprow diff --git a/FORTRAN/superlupara.f90 b/FORTRAN/superlupara.f90 index d246ae88..7ae58251 100644 --- a/FORTRAN/superlupara.f90 +++ b/FORTRAN/superlupara.f90 @@ -2,6 +2,11 @@ !! \brief This module contains some parameter used in SuperLU for !! Fortran90 user. ! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
module superlupara_mod @@ -14,7 +19,7 @@ module superlupara_mod public superlu_ptr !---------------------------------------------------- -! kind of integer to hold a SuperLU pointer. Use default integer. +! kind of integer to hold a SuperLU pointer. Use 64-bit integer. ! This might need to be changed on systems with large memory. ! If changed, be sure to change it in superlu_c2f_wrap.c too. ! @@ -24,8 +29,8 @@ module superlupara_mod !---------------------------------------------------- ! The following parameters are defined: -! These values come from superlu_defs.h. If the values in there change with -! the version of SuperLU, then they need to be changed here, too. +! These values come from superlu_enum_consts.h. If the values in there +! change, then they need to be changed here, too. integer, parameter, public :: & NO = 0, & ! yes_no_t @@ -35,8 +40,9 @@ module superlupara_mod SamePattern_SameRowPerm = 2, & FACTORED = 3, & NOROWPERM = 0, & ! rowperm_t - LargeDiag = 1, & - MY_PERMR = 2, & + LargeDiag_MC64 = 1, & + LargeDiag_HWPM = 2, & + MY_PERMR = 3, & NATURAL = 0, & ! colperm_t MMD_ATA = 1, & MMD_AT_PLUS_A = 2, & @@ -53,15 +59,26 @@ module superlupara_mod COL = 2, & BOTH = 3, & NOREFINE = 0, & ! IterRefine_t - SINGLE = 1, & - DOUBLE = 2, & - EXTRA = 3, & - LUSUP = 0, & ! MemType Need? - UCOL = 1, & - LSUB = 2, & - USUB = 3, & - SYSTEM = 0, & ! LU_space_t Need? - USER = 1 + SLU_SINGLE = 1, & + SLU_DOUBLE = 2, & + SLU_EXTRA = 3, & + USUB = 0, & ! MemType + LSUB = 1, & + UCOL = 2, & + LUSUP = 3, & + LLVL = 4, & + ULVL = 5, & + NO_MEMTYPE = 6, & + SYSTEM = 0, & ! LU_space_t + USER = 1, & + SILU = 0, & ! milu_t + SMILU_1 = 1, & + SMILU_2 = 2, & + SMILU_3 = 3 + +! These values come from supermatrix.h. If the values in there +! change, then they need to be changed here, too. + integer, parameter, public :: & SLU_NC = 0, & ! 
Stype_t SLU_NCP = 1, & @@ -85,7 +102,6 @@ module superlupara_mod SLU_HEL = 7, & SLU_HEU = 8 - !---------------------------------------------------- end module superlupara_mod diff --git a/README.md b/README.md index 2b3a7de6..cbafbbf9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SuperLU_DIST (version 6.4) superlu +# SuperLU_DIST (version 7.1.1) superlu [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) [Nightly tests](http://my.cdash.org/index.php?project=superlu_dist) @@ -11,15 +11,69 @@ to run accurately and efficiently on large numbers of processors. SuperLU_DIST is a parallel extension to the serial SuperLU library. It is targeted for the distributed memory parallel machines. -SuperLU_DIST is implemented in ANSI C, and MPI for communications. -Currently, the LU factorization and triangular solution routines, -which are the most time-consuming part of the solution process, -are parallelized. The other routines, such as static pivoting and -column preordering for sparsity are performed sequentially. -This "alpha" release contains double-precision real and double-precision -complex data types. +SuperLU_DIST is implemented in ANSI C, with OpenMP for on-node parallelism +and MPI for off-node communications. We are actively developing GPU +acceleration capabilities. 
+ + + + + + + +Table of Contents +================= + +* [SuperLU_DIST (version 7.1.1) superlu](#superlu_dist-version-70---) +* [Directory structure of the source code](#directory-structure-of-the-source-code) +* [Installation](#installation) + * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system) + * [Dependent external libraries: BLAS and ParMETIS](#dependent-external-libraries-blas-and-parmetis) + * [Optional external libraries: CombBLAS, LAPACK](#optional-external-libraries-combblas-lapack) + * [Use GPU](#use-gpu) + * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions) + * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile) + * [2.1 Edit the make.inc include file.](#21-edit-the-makeinc-include-file) + * [2.2. The BLAS library.](#22-the-blas-library) + * [2.3. External libraries.](#23-external-libraries) + * [2.3.1 Metis and ParMetis.](#231-metis-and-parmetis) + * [2.3.2 LAPACK.](#232-lapack) + * [2.3.3 CombBLAS.](#233-combblas) + * [2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#24-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface) + * [2.5. Multicore and GPU.](#25-multicore-and-gpu) +* [Summary of the environment variables.](#summary-of-the-environment-variables) +* [Windows Usage](#windows-usage) +* [Reading sparse matrix files](#reading-sparse-matrix-files) +* [REFERENCES](#references) +* [RELEASE VERSIONS](#release-versions) + +Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc) + +# SuperLU_DIST (version 7.0) superlu + +[![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) +[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist) + +SuperLU_DIST contains a set of subroutines to solve a sparse linear system +A*X=B. 
It uses Gaussian elimination with static pivoting (GESP). +Static pivoting is a technique that combines the numerical stability of +partial pivoting with the scalability of Cholesky (no pivoting), +to run accurately and efficiently on large numbers of processors. + +SuperLU_DIST is a parallel extension to the serial SuperLU library. +It is targeted for the distributed memory parallel machines. +SuperLU_DIST is implemented in ANSI C, with OpenMP for on-node parallelism +and MPI for off-node communications. We are actively developing GPU +acceleration capabilities. + + + + + + -### The distribution contains the following directory structure: + +# Directory structure of the source code ``` SuperLU_DIST/README instructions on installation @@ -28,6 +82,7 @@ SuperLU_DIST/CBLAS/ needed BLAS routines in C, not necessarily fast library with multiple OpenMP threads, performance relies on a good multithreaded BLAS implementation.) SuperLU_DIST/DOC/ the Users' Guide +SuperLU_DIST/FORTRAN/ Fortran90 wrapper functions SuperLU_DIST/EXAMPLE/ example programs SuperLU_DIST/INSTALL/ test machine dependent parameters SuperLU_DIST/SRC/ C source code, to be compiled into libsuperlu_dist.a @@ -41,35 +96,71 @@ SuperLU_DIST/make.inc compiler, compiler flags, library definitions and C SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files ``` -## INSTALLATION +# Installation -There are two ways to install the package. One requires users to -edit makefile manually, the other uses CMake automatic build system. +There are two ways to install the package. The first method is to use +CMake automatic build system. The other method requires users to The procedures are described below. -### Installation option 1: Using CMake build system. +## Installation option 1: Using CMake build system. You will need to create a build tree from which to invoke CMake. 
-First, in order to use parallel symbolic factorization function, you +### Dependent external libraries: BLAS and ParMETIS +If you have a BLAS library on your machine, you can link with it +with the following cmake definition: +``` +-DTPL_BLAS_LIBRARIES="" +``` +Otherwise, the CBLAS/ subdirectory contains the part of the C BLAS +(single threaded) needed by SuperLU_DIST, but they are not optimized. +You can compile and use it with the following cmake definition: +``` +-DTPL_ENABLE_INTERNAL_BLASLIB=ON +``` + +The default sparsity ordering is METIS. But, in order to use parallel +symbolic factorization function, you need to install ParMETIS parallel ordering package and define the two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR +(Note: ParMETIS library also contains serial METIS library.) + ``` export PARMETIS_ROOT= export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 ``` -Second, in order to use parallel weighted matching AWPM for numerical -pre-pivoting, you need to install CombBLAS and define the environment -variable: +### Optional external libraries: CombBLAS, LAPACK + +In order to use parallel weighted matching HWPM (Heavy Weight +Perfect Matching) for numerical pre-pivoting, you need to install +CombBLAS and define the environment variable: ``` export COMBBLAS_ROOT= export COMBBLAS_BUILD_DIR=${COMBBLAS_ROOT}/_build ``` +Then, install with cmake option: +``` +-DTPL_ENABLE_COMBBLASLIB=ON +``` -Once these needed third-party libraries are in place, SuperLU installation -can be done as follows from the top level directory: +By default, LAPACK is not needed. Only in triangular solve routine, we +may use LAPACK to explicitly invert the dense diagonal block to improve +speed. 
You can use it with the following cmake option: +``` +-DTPL_ENABLE_LAPACKLIB=ON +``` + +### Use GPU +You can enable GPU with CUDA with the following cmake option: +``` +-DTPL_ENABLE_CUDALIB=TRUE +-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so" +``` + +Once these needed third-party libraries are in place, the installation +can be done as follows at the top level directory: For a simple installation with default setting, do: (ParMETIS is needed, i.e., TPL_ENABLE_PARMETISLIB=ON) @@ -79,8 +170,7 @@ cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \ ``` - -For a more sophisticated installation including third-part libraries, do: +For a more sophisticated installation including third-party libraries, do: ``` cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ @@ -99,17 +189,11 @@ OOT}/Applications/BipartiteMatchings" \ ( see example cmake script: run_cmake_build.sh ) ``` -You can enable GPU with CUDA with the following cmake option: -``` -`-DTPL_ENABLE_CUDALIB=TRUE` -`-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"` -``` -You can disable LAPACK, ParMetis or CombBLAS with the following cmake option: +You can disable CombBLAS or LAPACK with the following cmake options: ``` -`-DTPL_ENABLE_LAPACKLIB=FALSE` -`-DTPL_ENABLE_PARMETISLIB=FALSE` -`-DTPL_ENABLE_COMBBLASLIB=FALSE` +-DTPL_ENABLE_LAPACKLIB=FALSE +-DTPL_ENABLE_COMBBLASLIB=FALSE ``` To actually build (compile), type: @@ -133,25 +217,38 @@ execution may fail. You can pass the definition option "-DMPIEXEC_EXECUTABLE" to cmake. For example on Cori at NERSC, you will need the following: `-DMPIEXEC_EXECUTABLE=/usr/bin/srun` -Or, you can always go to TEST/ directory to perform -testing manually. 
- -**Note on the C-Fortran name mangling handled by C preprocessor definition:** -In the default setting, we assume that Fortran expects a C routine -to have an underscore postfixed to the name. Depending on the -compiler, you may need to define one of the following flags in -during the cmake build to overwrite default setting: -``` -cmake .. -DCMAKE_C_FLAGS="-DNoChange" -cmake .. -DCMAKE_C_FLAGS="-DUpCase" -``` - - -### Installation option 2: Manual installation with makefile. +Or, you can always go to TEST/ directory to perform testing manually. + +### Summary of the CMake definitions. +The following list summarize the commonly used CMake definitions. In each case, +the first choice is the default setting. After running 'cmake' installation, +a configuration header file is generated in SRC/superlu_dist_config.h, which +contains the key CPP definitions used throughout the code. +``` + -TPL_ENABLE_PARMETISLIB=ON | OFF + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON + -DTPL_ENABLE_LAPACKLIB=OFF | ON + -TPL_ENABLE_COMBBLASLIB=OFF + -DTPL_ENABLE_CUDALIB=OFF | ON + -Denable_complex16=OFF | ON + -DXSDK_INDEX_SIZE=32 | 64 + + -DBUILD_SHARED_LIBS= OFF | ON + -DCMAKE_INSTALL_PREFIX=<...>. + -DCMAKE_C_COMPILER= + -DCMAKE_C_FLAGS="..." + -DCMAKE_CXX_COMPILER= + -DMAKE_CXX_FLAGS="..." + -DCMAKE_CUDA_FLAGS="..." + -DXSDK_ENABLE_Fortran=OFF | ON + -DCMAKE_Fortran_COMPILER= +``` + +## Installation option 2: Manual installation with makefile. Before installing the package, please examine the three things dependent on your system setup: -#### 1.1 Edit the make.inc include file. +### 2.1 Edit the make.inc include file. This make include file is referenced inside each of the Makefiles in the various subdirectories. As a result, there is no need to @@ -183,7 +280,7 @@ printing level to show solver's execution details. (default 0) diagnostic printing level for debugging purpose. (default 0) ``` -#### 1.2. The BLAS library. +### 2.2. The BLAS library. 
The parallel routines in SuperLU_DIST use some BLAS routines on each MPI process. Moreover, if you enable OpenMP with multiple threads, you need to @@ -210,22 +307,9 @@ top-level SuperLU_DIST/ directory and do the following: to make the BLAS library from the routines in the ` CBLAS/ subdirectory.` -#### 1.3. External libraries. - - ##### 1.3.1 LAPACK. - Starting Version 6.0, the triangular solve routine can perform explicit - inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine. - To use this feature, you should define the following in make.inc: -``` -SLU_HAVE_LAPACK = TRUE -LAPACKLIB = -``` -You can disable LAPACK with the following line in SRC/superlu_dist_config.h: -``` -#undef SLU_HAVE_LAPACK -``` +### 2.3. External libraries. - ##### 1.3.2 Metis and ParMetis. + #### 2.3.1 Metis and ParMetis. If you will use Metis or ParMetis for sparsity ordering, you will need to install them yourself. Since ParMetis package already @@ -243,9 +327,21 @@ I_PARMETIS = -I/include -I/metis/include You can disable ParMetis with the following line in SRC/superlu_dist_config.h: ``` #undef HAVE_PARMETIS +``` + #### 2.3.2 LAPACK. + Starting Version 6.0, the triangular solve routine can perform explicit + inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine. + To use this feature, you should define the following in make.inc: +``` +SLU_HAVE_LAPACK = TRUE +LAPACKLIB = +``` +You can disable LAPACK with the following line in SRC/superlu_dist_config.h: +``` +#undef SLU_HAVE_LAPACK ``` - ##### 1.3.3 CombBLAS. + #### 2.3.3 CombBLAS. You can use parallel approximate weight perfect matching (AWPM) algorithm to perform numerical pre-pivoting for stability. The default pre-pivoting @@ -265,10 +361,9 @@ You can disable CombBLAS with the following line in SRC/superlu_dist_config.h: #undef HAVE_COMBBLAS ``` +### 2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.) -#### 1.4. C preprocessor definition CDEFS. 
(Replaced by cmake module FortranCInterface.) - -In the header file SRC/Cnames.h, we use macros to determine how +In the header file SRC/superlu_Cnames.h, we use macros to determine how C routines should be named so that they are callable by Fortran. (Some vendor-supplied BLAS libraries do not have C interfaces. So the re-naming is needed in order for the SuperLU BLAS calls (in C) to @@ -283,7 +378,7 @@ The possible options for CDEFS are: -DUpCase: Fortran expects a C routine name to be all uppercase. ``` -#### 1.5. Multicore and GPU (optional). +### 2.5. Multicore and GPU. To use OpenMP parallelism, need to link with an OpenMP library, and set the number of threads you wish to use as follows (bash): @@ -302,8 +397,25 @@ A Makefile is provided in each subdirectory. The installation can be done completely automatically by simply typing "make" at the top level. +# Summary of the environment variables. +A couple of environment variables affect parallel execution. +``` + export OMP_NUM_THREADS=<...> + export SUPERLU_ACC_OFFLOAD=1 // this enables use of GPU. Default is 0. +``` +Several integer blocking parameters may affect performance. Most of them can be +set by the user through environment variables. Oherwise the default values +are provided. Various SuperLU routines call an environment inquiry function +to obtain these parameters. This function is provided in the file SRC/sp_ienv.c. +Please consult that file for detailed description of the meanings. +``` + export NREL=<...> // supernode relaxation parameter + export NSUP=<...> // maximum allowable supernode size, not to exceed 512 + export FILL=<...> // estimated fill ratio of nonzeros(L+U)/nonzeros(A) + export MAX_BUFFER_SIZE=<...> // maximum buffer size on GPU for GEMM +``` -## Windows Usage +# Windows Usage Prerequisites: CMake, Visual Studio, Microsoft HPC Pack This has been tested with Visual Studio 2017, without Parmetis, without Fortran, and with OpenMP disabled. 
@@ -337,7 +449,7 @@ for the above configuration. If you wish to test: `ctest` -## READING SPARSE MATRIX FILES +# Reading sparse matrix files The SRC/ directory contains the following routines to read different file formats, they all have the similar calling sequence. @@ -350,7 +462,7 @@ dreadtriple.c : triplet, with header dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab ``` -## REFERENCES +# REFERENCES **[1]** X.S. Li and J.W. Demmel, "SuperLU_DIST: A Scalable Distributed-Memory Sparse Direct Solver for Unsymmetric Linear Systems", ACM Trans. on Math. @@ -363,24 +475,34 @@ dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab Porto, Portugal. **[4]** P. Sao, X.S. Li, R. Vuduc, “A Communication-Avoiding 3D Factorization for Sparse Matrices”, Proc. of IPDPS, May 21–25, 2018, Vancouver. -**[5]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable +**[5]** P. Sao, R. Vuduc, X. Li, "Communication-avoiding 3D algorithm for + sparse LU factorization on heterogeneous systems", J. Parallel and + Distributed Computing (JPDC), September 2019. +**[6]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable distributed-memory sparse triangular solution algorithms”, Proc. of SIAM workshop on Combinatorial Scientific Computing, June 6-8, 2018, - Bergen, Norway. + Bergen, Norway. +**[7]** N. Ding, S. Williams, Y. Liu, X.S. Li, "Leveraging One-Sided + Communication for Sparse Triangular Solvers", Proc. of SIAM Conf. on + Parallel Processing for Scientific Computing. Feb. 12-15, 2020. +**[8]** A. Azad, A. Buluc, X.S. Li, X. Wang, and J. Langguth, +"A distributed-memory algorithm for computing a heavy-weight perfect matching +on bipartite graphs", SIAM J. Sci. Comput., Vol. 42, No. 4, pp. C143-C168, 2020. -**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov) + +**Xiaoye S. 
Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov) **Gustavo Chavez**, Lawrence Berkeley National Lab, [gichavez@lbl.gov](gichavez@lbl.gov) +**Nan Ding**, Lawrence Berkeley National Lab, [nanding@lbl.gov](nanding@lbl.gov) **Laura Grigori**, INRIA, France, [laura.grigori@inria.fr](laura.grigori@inria.fr) **Yang Liu**, Lawrence Berkeley National Lab, [liuyangzhuan@lbl.gov](liuyangzhuan@lbl.gov) -**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) **Piyush Sao**, Georgia Institute of Technology, [piyush.feynman@gmail.com](piyush.feynman@gmail.com) +**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) **Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki@gmail.com](ic.yamazaki@gmail.com) -**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu) +**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu) **John Gilbert**, UC Santa Barbara, [gilbert@cs.ucsb.edu](gilbert@cs.ucsb.edu) - -## RELEASE VERSIONS +# RELEASE VERSIONS ``` October 15, 2003   Version 2.0 October 1, 2007   Version 2.1 @@ -406,4 +528,5 @@ February 8, 2019 Version 6.1.1 November 12, 2019 Version 6.2.0 February 23, 2020 Version 6.3.0 October 23, 2020 Version 6.4.0 +May 10, 2021 Version 7.0.0 ``` diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index c2eeb9fe..00ebb8b8 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -42,16 +42,26 @@ set(sources colamd.c superlu_dist_version.c comm_tree.c + superlu_grid3d.c ## 3D code + supernodal_etree.c + supernodalForest.c + trfAux.c + communication_aux.c + treeFactorization.c + sec_structs.c ) +if (HAVE_CUDA) + list(APPEND sources superlu_gpu_utils.cu) +endif() + if (MSVC) list(APPEND sources wingetopt.c) endif () - set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0) if(enable_double) - list(APPEND headers superlu_ddefs.h) + list(APPEND headers superlu_ddefs.h dlustruct_gpu.h) if(TPL_ENABLE_CUDALIB) list(APPEND 
sources pdgstrs_lsum_cuda.cu) @@ -97,16 +107,94 @@ endif() pdgsrfs_ABXglobal.c pdgsmv_AXglobal.c pdGetDiagU.c + pdgssvx3d.c ## 3D code + dnrformat_loc3d.c + pdgstrf3d.c + dtreeFactorization.c + dtreeFactorizationGPU.c + dgather.c + dscatter3d.c + pd3dcomm.c + dtrfAux.c + dcommunication_aux.c + dtrfCommWrapper.c + dsuperlu_blas.c ) +if (HAVE_CUDA) + list(APPEND sources dsuperlu_gpu.cu) +endif() + if (HAVE_COMBBLAS) list(APPEND sources d_c2cpp_GetHWPM.cpp dHWPM_CombBLAS.hpp) endif() -endif() ## enable double +endif() ########## enable double + +if(enable_single) + list(APPEND headers superlu_sdefs.h slustruct_gpu.h) + + list(APPEND sources + slangs_dist.c + sgsequ_dist.c + slaqgs_dist.c + sutil_dist.c + smemory_dist.c + smyblas2_dist.c + ssp_blas2_dist.c + ssp_blas3_dist.c + psgssvx.c + psgssvx_ABglobal.c + sreadhb.c + sreadrb.c + sreadtriple.c + sreadtriple_noheader.c + sbinary_io.c + sreadMM.c + psgsequ.c + pslaqgs.c + sldperm_dist.c + pslangs.c + psutil.c + pssymbfact_distdata.c + sdistribute.c + psdistribute.c + psgstrf.c + sstatic_schedule.c + psgstrf2.c + psgstrs.c + psgstrs1.c + psgstrs_lsum.c + psgstrs_Bglobal.c + psgsrfs.c + psgsmv.c + psgsrfs_ABXglobal.c + psgsmv_AXglobal.c + psGetDiagU.c + psgssvx3d.c ## 3D code + snrformat_loc3d.c + psgstrf3d.c + streeFactorization.c + streeFactorizationGPU.c + sgather.c + sscatter3d.c + ps3dcomm.c + strfAux.c + scommunication_aux.c + strfCommWrapper.c + ssuperlu_blas.c + ) +if (HAVE_CUDA) + list(APPEND sources ssuperlu_gpu.cu) +endif() +if (HAVE_COMBBLAS) + list(APPEND sources s_c2cpp_GetHWPM.cpp sHWPM_CombBLAS.hpp) +endif() + +endif() ########## enable single if(enable_complex16) - list(APPEND headers superlu_zdefs.h) + list(APPEND headers superlu_zdefs.h zlustruct_gpu.h) list(APPEND sources dcomplex_dist.c @@ -146,37 +234,64 @@ if(enable_complex16) pzgsrfs_ABXglobal.c pzgsmv_AXglobal.c pzGetDiagU.c + pzgssvx3d.c ## 3D code + znrformat_loc3d.c + pzgstrf3d.c + ztreeFactorization.c + ztreeFactorizationGPU.c + zscatter3d.c 
+ zgather.c + pz3dcomm.c ztrfAux.c + zcommunication_aux.c + ztrfCommWrapper.c + zsuperlu_blas.c ) -if (HAVE_COMBBLAS) - list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) +if (HAVE_CUDA) + list(APPEND sources zsuperlu_gpu.cu) endif() +if (HAVE_COMBBLAS) + list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) endif() +endif() ######### enable compex16 if (TPL_ENABLE_HIPLIB) + file(GLOB MyFiles *.hip.cpp) set_source_files_properties( - pdgstrs_lsum_cuda.hip.cpp + ${MyFiles} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_add_library(superlu_dist "pdgstrs_lsum_cuda.hip.cpp") + hip_add_library(superlu_dist "superlu_gpu_utils.hip.cpp") + hip_add_library(superlu_dist "dsuperlu_gpu.hip.cpp") + if(enable_single) + hip_add_library(superlu_dist "ssuperlu_gpu.hip.cpp") + endif() + if(enable_complex16) + hip_add_library(superlu_dist "zsuperlu_gpu.hip.cpp") + endif() else() add_library(superlu_dist "") endif() -# if (BUILD_SHARED_LIBS) -# set_property(TARGET superlu_dist PROPERTY POSITION_INDEPENDENT_CODE ON) -# endif() if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) if (TPL_ENABLE_HIPLIB) hip_add_library(superlu_dist-static STATIC "pdgstrs_lsum_cuda.hip.cpp") + hip_add_library(superlu_dist-static STATIC "dsuperlu_gpu.hip.cpp") + if(enable_single) + hip_add_library(superlu_dist-static STATIC "ssuperlu_gpu.hip.cpp") + endif() + if(enable_complex16) + hip_add_library(superlu_dist-static STATIC "zsuperlu_gpu.hip.cpp") + endif() else() add_library(superlu_dist-static STATIC "") endif() endif() -# if (BUILD_SHARED_LIBS) -# set_property(TARGET superlu_dist-static PROPERTY POSITION_INDEPENDENT_CODE ON) -# endif() + target_sources(superlu_dist PRIVATE ${sources} ${HEADERS}) set(targets superlu_dist) + if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) # build both shared and static libs target_sources(superlu_dist-static PRIVATE ${sources} ${HEADERS}) @@ -209,6 +324,17 @@ foreach(target ${targets}) endif() endforeach(target) +# Add CUDA runtime library and CUBLAS library 
+if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) +endif() + +# This is recommended by modern cmake: +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html +if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist OpenMP::OpenMP_C) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/Cnames.h b/SRC/Cnames.h deleted file mode 100644 index a4d1672d..00000000 --- a/SRC/Cnames.h +++ /dev/null @@ -1,378 +0,0 @@ -/*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ -/*! @file - * \brief Macro definitions - * - *
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- * 
- */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. 
- */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridmap f_superlu_gridmap_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_zCreate_CompRowLoc_Mat_dist f_zcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist 
f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_zSolveFinalize f_zsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_pzgssvx f_pzgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_zcreate_dist_matrix f_zcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if (F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ DASUM -#define idamax_ IDAMAX -#define dcopy_ DCOPY -#define dscal_ DSCAL -#define dger_ DGER -#define dnrm2_ DNRM2 -#define dsymv_ DSYMV -#define ddot_ DDOT -#define daxpy_ DAXPY -#define dsyr2_ DSYR2 -#define drot_ DROT -#define dgemv_ DGEMV -#define dtrsv_ DTRSV -#define dgemm_ DGEMM -#define dtrsm_ DTRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ DZASUM -#define izamax_ IZAMAX -#define zcopy_ ZCOPY -#define zscal_ ZSCAL -#define dznrm2_ DZNRM2 -#define zaxpy_ ZAXPY -#define zgemv_ ZGEMV -#define ztrsv_ ZTRSV -#define zgemm_ ZGEMM -#define ztrsm_ ZTRSM - -#define zgerc_ ZGERC -#define zhemv_ ZHEMV -#define zher2_ ZHER2 -#define zgeru_ ZGERU - -/* LAPACK */ -#define strtri_ STRTRI -#define dtrtri_ DTRTRI 
-#define ctrtri_ CTRTRI -#define ztrtri_ ZTRTRI - -/* -#define mc64id_dist MC64ID_DIST -#define mc64ad_dist MC64AD_DIST -*/ -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridmap F_SUPERLU_GRIDMAP -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define f_ScalePermstructFree 
F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_zCreate_CompRowLoc_Mat_dist F_ZCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_zSolveFinalize F_ZSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_pzgssvx F_PZGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_zcreate_dist_matrix F_ZCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) 
- */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define strtri_ strtri -#define dtrtri_ dtrtri -#define ctrtri_ ctrtri -#define ztrtri_ ztrtri - -/* -#define mc64id_dist mc64id_dist -#define mc64ad_dist mc64ad_dist -*/ - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle 
f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridmap f_superlu_gridmap -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_zSolveFinalize f_zsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_pzgssvx f_pzgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_zcreate_dist_matrix 
f_zcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#endif /* __SUPERLU_CNAMES */ diff --git a/SRC/Makefile b/SRC/Makefile index 18b21b01..19402bef 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -26,6 +26,12 @@ # ####################################################################### include ../make.inc + +# FACT3D = scatter.o +# pdgstrs_vecpar.o ancFactorization.o + +# pddrive_params.o + # # Precision independent routines # @@ -35,11 +41,9 @@ ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \ xerr_dist.o smach_dist.o dmach_dist.o \ superlu_dist_version.o TreeInterface.o - -ifeq ($(HAVE_CUDA),TRUE) -ALLAUX += gpublas_utils.o pdgstrs_lsum_cuda.o pdgstrs_lsum_cuda_dlink.o -endif - +# Following are from 3D code +ALLAUX += superlu_grid3d.o supernodal_etree.o supernodalForest.o \ + trfAux.o communication_aux.o treeFactorization.o sec_structs.o # # Routines literally taken from SuperLU, but renamed with suffix _dist # @@ -58,7 +62,12 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ pdgstrf.o dstatic_schedule.o pdgstrf2.o pdGetDiagU.o \ pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \ pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \ - dreadtriple_noheader.o + dreadtriple_noheader.o dsuperlu_blas.o +# from 3D code +DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \ + dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o \ + dnrformat_loc3d.o dtreeFactorizationGPU.o ##$(FACT3D) + # # Routines for double complex parallel SuperLU ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ @@ -68,7 +77,17 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ pzgstrf.o zstatic_schedule.o pzgstrf2.o pzGetDiagU.o \ pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \ pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ - zreadtriple_noheader.o + zreadtriple_noheader.o zsuperlu_blas.o +# from 3D code +ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o 
zscatter3d.o \ + zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \ + znrformat_loc3d.o ztreeFactorizationGPU.o ##$(FACT3D) + +ifeq ($(HAVE_CUDA),TRUE) +ALLAUX += cublas_utils.o superlu_gpu_utils.o pdgstrs_lsum_cuda.o +DPLUSRC += dsuperlu_gpu.o +ZPLUSRC += zsuperlu_gpu.o +endif ifeq ($(HAVE_COMBBLAS),TRUE) DPLUSRC += d_c2cpp_GetHWPM.o @@ -124,13 +143,9 @@ pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) -pdgstrs_lsum_cuda.o: pdgstrs_lsum_cuda.cu - $(NVCC) $(CUDACFLAGS) -dc $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c pdgstrs_lsum_cuda.cu $(VERBOSE) +.cu.o: + $(NVCC) $(NVCCFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) -pdgstrs_lsum_cuda_dlink.o: - $(NVCC) $(CUDACFLAGS) -dlink $(CUDALIBS) -o pdgstrs_lsum_cuda_dlink.o pdgstrs_lsum_cuda.o $(VERBOSE) - - .cpp.o: $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) diff --git a/SRC/TreeInterface.cpp b/SRC/TreeInterface.cpp new file mode 100644 index 00000000..60c957c5 --- /dev/null +++ b/SRC/TreeInterface.cpp @@ -0,0 +1,337 @@ +#include "TreeReduce_slu.hpp" +#include "dcomplex.h" + +namespace SuperLU_ASYNCOMM{ + + + +#ifdef __cplusplus + extern "C" { +#endif + + BcTree BcTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed, char precision){ + assert(msgSize>0); + if(precision=='d'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } + } + + void BcTree_Destroy(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + 
if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } + + } + + void BcTree_SetTag(BcTree Tree, Int tag, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + } + + + yes_no_t BcTree_IsRoot(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + } + + + void BcTree_forwardMessageSimple(BcTree Tree, void* localBuffer, Int msgSize, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((double*)localBuffer,msgSize); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((float*)localBuffer,msgSize); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); + } + } + + void BcTree_waitSendRequest(BcTree Tree, char precision) { + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->waitSendRequest(); + } + } + + void BcTree_allocateRequest(BcTree Tree, char precision){ + 
if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->allocateRequest(); + } + } + + int BcTree_getDestCount(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetDestCount(); + } + } + + int BcTree_GetMsgSize(BcTree Tree, char precision){ + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->GetMsgSize(); + } + } + + StdList StdList_Init(){ + std::list* lst = new std::list(); + return (StdList) lst; + } + void StdList_Pushback(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + list->push_back(dat); + } + + void StdList_Pushfront(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + list->push_front(dat); + } + + int_t StdList_Popfront(StdList lst){ + std::list* list = (std::list*) lst; + int_t dat = -1; + if((*list).begin()!=(*list).end()){ + dat = (*list).front(); + list->pop_front(); + } + return dat; + } + + yes_no_t StdList_Find(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + for (std::list::iterator itr = (*list).begin(); itr != (*list).end(); /*nothing*/){ + if(*itr==dat)return YES; + ++itr; + } + return NO; + } + + int_t StdList_Size(StdList lst){ + std::list* list = (std::list*) lst; + return list->size(); + } + + 
+ yes_no_t StdList_Empty(StdList lst){ + std::list* list = (std::list*) lst; + return (*list).begin()==(*list).end()?YES:NO; + } + + + RdTree RdTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed, char precision){ + assert(msgSize>0); + if(precision=='d'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (RdTree) ReduceTree; + } + } + + void RdTree_Destroy(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + delete ReduceTree; + } + } + + + void RdTree_SetTag(RdTree Tree, Int tag, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->SetTag(tag); + } + } + + int RdTree_GetDestCount(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetDestCount(); + } + } + + int RdTree_GetMsgSize(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = 
(TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->GetMsgSize(); + } + } + + yes_no_t RdTree_IsRoot(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + return ReduceTree->IsRoot()?YES:NO; + } + } + + void RdTree_forwardMessageSimple(RdTree Tree, void* localBuffer, Int msgSize, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((double*)localBuffer,msgSize); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((float*)localBuffer,msgSize); + } + if(precision=='z'){TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); + } + } + + void RdTree_allocateRequest(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + if(precision=='z'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->allocateRequest(); + } + } + + void RdTree_waitSendRequest(RdTree Tree, char precision){ + if(precision=='d'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + if(precision=='s'){ + TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + if(precision=='z'){ + 
TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; + ReduceTree->waitSendRequest(); + } + } + +#ifdef __cplusplus + } +#endif + +} //namespace SuperLU_ASYNCOMM + diff --git a/SRC/acc_aux.c b/SRC/acc_aux.c new file mode 100644 index 00000000..d62d0c4e --- /dev/null +++ b/SRC/acc_aux.c @@ -0,0 +1,665 @@ +#include "acc_aux.h" + +#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) + +// int +// get_thread_per_process () +// { +// char *ttemp; +// ttemp = getenv ("THREAD_PER_PROCESS"); + +// if (ttemp) +// return atoi (ttemp); +// else +// return 1; +// } + + +static inline double +load_imb (double *A, int nthreads) +{ + int i; + double _mx, _avg; + _mx = 0; + _avg = 0; + for (i = 0; i < nthreads; i++) + { + _mx = (((_mx) > (A[i])) ? (_mx) : (A[i])); + _avg += A[i]; + } + _avg = _avg / (double) nthreads; + return _mx - _avg; +} + + + +// int_t +// get_max_buffer_size () +// { +// char *ttemp; +// ttemp = getenv ("MAX_BUFFER_SIZE"); +// if (ttemp) +// return atoi (ttemp); +// else +// return 5000000; +// } + + +// #define ACC_ASYNC_COST 3.79e-3 + +#define MAX_DIM 12800 +#define MAX_IN_DIM 256 +#define LOG_2_MAX_IN_DIM 8 +#define LOG_2_MAX_DIM 7 + + +double get_acc_async_cost() +{ + char *ttemp; + ttemp = getenv ("ACC_ASYNC_COST"); + if (ttemp) + return (double) atof (ttemp); + else + return 4e-3; +} + +// #define CPU_BANDWIDTH 35.0 + +double cpu_bandwidth; +int communication_overlap; +double acc_async_cost; + + +int_t fixed_partition; +double frac; + +/* Sherry: these lookup tables are not needed on Titan, nor Summit */ +double CpuDgemmLookUp[8][8][9]; +double PhiDgemmLookUp[8][8][9]; +double PhiBWLookUp[8]; // no used? +double MicPciBandwidth[18]; // no used? 
+double MicScatterBW[24][24]; + +#ifdef OFFLOAD_PROFILE +double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING]; +double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; +#endif + +static inline unsigned int next_power_2(unsigned int v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + + +static inline unsigned int previous_power_2(unsigned int v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v / 2; +} + + +#include +// static inline uint32_t my_log2(const uint32_t x) { +// uint32_t y; +// asm ( "\tbsr %1, %0\n" +// : "=r"(y) +// : "r" (x) +// ); +// return y; +// } + +static inline uint32_t my_log2(const uint32_t x) +{ + return (uint32_t) log2((double) x); +} + +static inline unsigned int nearst_2_100(unsigned int v) +{ + v = (v + 99) / 100; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return SUPERLU_MIN(my_log2(v), LOG_2_MAX_DIM) ; +} + +static inline unsigned int nearst_k(unsigned int v) +{ + + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return SUPERLU_MIN(my_log2(v), LOG_2_MAX_IN_DIM) ; +} + + + +double estimate_acc_time(int m, int n , int k) +{ + double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + + double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))]; + + double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ; + // printf("gemm_time %.2e scatter_time %.2e, flop_rate %lf mop_rate %lf ",gemm_time, scatter_time, flop_rate,mop_rate); + if (gemm_time < 0) + { + /* code */ + printf(" m %d n %d k %d \n", m, n, k); + exit(0); + } + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if 
(communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (gemm_time + scatter_time) / eta; +} + + + +double estimate_acc_gemm_time(int m, int n , int k) +{ + double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if (communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (gemm_time) / eta; +} + + +double estimate_acc_scatter_time(int m, int n , int k) +{ + + double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))]; + + double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ; + + double eta = 1; /*to allow more computations on ACC only applicable for MPI cases*/ + // if(m>1024 && k>32) eta=1.5; + if (communication_overlap) + { + if (m > 2048 && k > 32) eta = 5.0; + if (m > 4096 && k > 32) eta = 6.0; + if (m > 4096 && k > 64) eta = 8.0; + } + + + return (scatter_time) / eta; +} + +double estimate_cpu_time(int m, int n , int k) +{ + if (m == 0 || n == 0 || k == 0) + { + return 0; + } + double flop_rate = CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)]; + double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate); + double scatter_time = 24e-9 * (double) m * (double)n / cpu_bandwidth ; + return gemm_time + scatter_time; +} + + +double acc_data_send_time(size_t sz) +{ + if (my_log2((sz + 999) / 1000) > 17 ) return 1e-9 * (double) sz / MicPciBandwidth[17]; + return 1e-9 * (double) sz / MicPciBandwidth[my_log2((sz + 999) / 1000)]; +} + + +void LookUpTableInit(int my_rank) +{ + char *ttemp; + char buffer[1024]; + char *line; + FILE *fp; + + ttemp = getenv("CPU_BANDWIDTH"); + if (ttemp) + { + 
cpu_bandwidth = atof(ttemp); +#ifdef GPU_DEBUG + if (!my_rank) printf("Bandwidth of CP is %lf \n", cpu_bandwidth ); +#endif + } + else + { + printf("Please set CPU_BANDWIDTH : bbye\n"); + exit(0); + + } + + // ttemp = getenv("SLU_ACC_COMM_OVERLAP"); + // if (ttemp) + // { + // communication_overlap = atoi(ttemp); + // if (!my_rank && communication_overlap ) printf("Using communication computation overlap version\n"); + // } + // else + // { + // printf("Please set SLU_ACC_COMM_OVERLAP : bbye\n"); + // exit(0); + // } + + + /*Reading CPU performance table */ + ttemp = getenv("CPU_DGEMM_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + double max_flop_rate = 0; + if (!fp) + { + if (!my_rank) printf("can not open %s: exiting \n", ttemp); + exit(0); + } + + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m, n, k; + double flop_rate; + sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate); + CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate; + max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate); + } + fclose(fp); + // printf("CPU: MAX FLOP Rate %lf GFLOP/Sec\n",max_flop_rate ); + } + else + { + printf("Assign CPU performance table \n"); + exit(0); + } + + ttemp = getenv("ACC_DGEMM_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + if (!fp) + { + printf("can not open %s: exiting \n", ttemp); + exit(0); + } + double max_flop_rate = 0; + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m, n, k; + double flop_rate; + sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate); + PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate; + max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate); + } + fclose(fp); +#ifdef GPU_DEBUG + if (!my_rank) printf("ACC: MAX FLOP Rate %lf GFLOP/Sec\n", max_flop_rate ); +#endif + } + else + { + printf("Assign ACC DGEMM performance table \n"); + exit(0); + } + + ttemp = getenv("ACC_SCATTER_PERF_TABLE"); + if (ttemp) + { + fp = 
fopen(ttemp, "r"); + double max_mop_rate = 0; + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m; + double mop_rate, sd; + sscanf(line, "%d, %lf, %lf", &m, &mop_rate, &sd); + PhiBWLookUp[nearst_2_100(m)] = mop_rate; + max_mop_rate = SUPERLU_MAX(mop_rate, max_mop_rate); + } + fclose(fp); +#ifdef GPU_DEBUG + if (!my_rank) printf("ACC: MAX MOP Rate %lf GFLOP/Sec\n", max_mop_rate ); +#endif + } + else + { + printf("Assign ACC DGEMM performance table \n"); + exit(0); + } + + + ttemp = getenv("ACC_PCI_BW_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + + while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL) + { + + int m; + double bw; + sscanf(line, "%d,%lf", &m, &bw); + MicPciBandwidth[my_log2(m / 1000)] = bw; + + } + fclose(fp); + } + else + { + printf("Assign ACC_PCI_BW_TABLE \n"); + exit(0); + } + + ttemp = getenv("ACC_SCATTER_BW_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + + + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + fscanf(fp, "%lf", &MicScatterBW[i][j]); + // printf("%d %d %lf\n",i,j,MicScatterBW[i][j] ); + } + } + + + fclose(fp); + } + else + { + printf("Assign ACC_SCATTER_BW_TABLE \n"); + exit(0); + } + +#ifdef OFFLOAD_PROFILE + ttemp = getenv("ACC_SCATTER_TLI_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + double max_mop_rate = 0; + + for (int i = 0; i < MAX_BLOCK_SIZE / STEPPING; ++i) + { + for (int j = 0; j < MAX_BLOCK_SIZE / STEPPING; ++j) + { + + fscanf(fp, "%lf", &MicScatterTLI[i][j]); + if (MicScatterTLI[i][j] > 2) + { + MicScatterTLI[i][j] = 2; + } + // printf("%lf \n", MicScatterTLI[i][j]); + } + } + + + fclose(fp); + } + else + { + printf("ACC_SCATTER_TLI_TABLE \n"); + exit(0); + } + + ttemp = getenv("HOST_SCU_PERF_TABLE"); + if (ttemp) + { + fp = fopen(ttemp, "r"); + for (int_t k = 0; k < CBLOCK / CSTEPPING; ++k) + { + + for (int_t i = 0; i < CBLOCK / CSTEPPING; ++i) + { + for (int_t j = 0; j < CBLOCK / CSTEPPING; ++j) + { + fscanf(fp, "%lf", &host_scu_flop_rate[k][i][j]); + 
+ } + + } + } + fclose(fp); + } + else + { + printf("please assign HOST_SCU_PERF_TABLE \n"); + exit(0); + } + +#endif + + ttemp = getenv("FIXED_PARTITION"); + if (ttemp) + { + fixed_partition = atoi(ttemp); + if (fixed_partition) + { + printf("Using fixed workload partition \n"); + ttemp = getenv("CPU_ACC_WORK_PARTITION"); + if (ttemp) + { + frac = atof (ttemp); + } + else + { + frac = 1; + } + + } + + } + else + { + fixed_partition = 0; + } + +} /* end LookupTableInit */ + + +double l_count[24]; /*used for keeping entries*/ +double u_count[24]; /*for keeping u entries*/ + +double +estimate_acc_scatter_time_strat1(Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb ) +{ + for (int i = 0; i < 24; ++i) + { + l_count[i] = 0; + u_count[i] = 0; + } + + int_t cum_nrows = 0; + int_t cum_ncols = 0; + for (int i = 0; i < nub; ++i) + { + int_t ncols = Ublock_info[i].ncols; + int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0); + u_count[ind] += (double) ncols; + cum_ncols += ncols; + + } + + + for (int i = 0; i < nlb; ++i) + { + int_t nrows = Lblock_info[i].nrows; + int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0); + l_count[ind] += (double) nrows; + + cum_nrows += nrows; + + } + + double ttime = 0; + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + /* code */ + ttime += 8 * 3e-9 * l_count[i] * u_count[j] / MicScatterBW[i][j]; + } + } + + // ttime *= (MicScatterTLI[CLAMP( CEILING(cum_nrows, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )][CLAMP( CEILING(cum_ncols, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )]) ; + ttime *= SUPERLU_MIN(nub * nlb / 240 , 1); + return ttime; +} + +#ifdef OFFLOAD_PROFILE +/*following is a good strategy; gives good prediction but for some reason I do not get over all performance +improvement so I've ommited this thing out*/ +double +estimate_cpu_sc_time_strat1(int_t ldu, Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb ) +{ + int_t ind_k = SUPERLU_MAX(CEILING(ldu, 8) - 1, 0); + 
for (int i = 0; i < 24; ++i) + { + l_count[i] = 0; + u_count[i] = 0; + } + + int_t cum_nrows = 0; + int_t cum_ncols = 0; + for (int i = 0; i < nub; ++i) + { + int_t ncols = Ublock_info[i].ncols; + int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0); + u_count[ind] += (double) ncols; + cum_ncols += ncols; + + } + + + for (int i = 0; i < nlb; ++i) + { + int_t nrows = Lblock_info[i].nrows; + int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0); + l_count[ind] += (double) nrows; + cum_nrows += nrows; + } + + double ttime = 0; + for (int i = 0; i < 24; ++i) + { + for (int j = 0; j < 24; ++j) + { + /* flop rate is in gf/sec */ + ttime += 2e-9 * ldu * l_count[i] * u_count[j] / host_scu_flop_rate[ind_k][i][j]; + } + } + + return ttime; +} + +#endif + +/* Sherry: this routine is not called */ +int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi) +{ + int_t acc_cols, cpu_cols; + int_t total_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols; + if (frac == 0) + { + return num_u_blks_Phi; + } + else if (frac == 1) + { + return 0; + } + + for (int_t j = num_u_blks_Phi - 1; j > -1; --j) // ### + { + + acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ; + cpu_cols = total_cols - acc_cols; + + if (estimate_acc_time (Rnbrow, acc_cols, ldu_Phi) < acc_async_cost) + { + break; + } + if (cpu_cols > frac * total_cols ) + { + return j; + } + + } + + return 0; +} + + +/* Partition the "num_u_blks_Phi" portion into GPU and CPU part, + based on the estimated computational cost on CPU and GPU. + This was useful for the old Intel Phi architecture, but for the + new architecture, such as Titan and Summit, we can give everything + to GPU. 
+*/ +int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info, int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi ) +{ + double cpu_time, acc_time; + int_t acc_cols, cpu_cols; + + for (int_t j = num_u_blks_Phi - 1; j > -1; --j) // ### + { + + acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ; + cpu_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols - acc_cols; + acc_time = estimate_acc_scatter_time_strat1(&Ublock_info_Phi[0], j, + Remain_info, RemainBlk ) + estimate_acc_gemm_time(Rnbrow, acc_cols, ldu_Phi); + + cpu_time = estimate_cpu_time(Rnbrow, cpu_cols, ldu_Phi) + cpu_time_0; + + + // SCT.Predicted_host_sch_time[k0] = cpu_time_without_offload; + if (cpu_time > acc_time + acc_async_cost) + { + return j; + + } + } + + return 0; /*default value is zero */ +} + + diff --git a/SRC/acc_aux.h b/SRC/acc_aux.h new file mode 100644 index 00000000..b63ca119 --- /dev/null +++ b/SRC/acc_aux.h @@ -0,0 +1,48 @@ +#pragma once + +// #include "pdgstrf.h" + + +typedef struct mdwin_t +{ + double cpu_bandwidth; + int communication_overlap; + double acc_async_cost; + + + int_t fixed_partition; + double frac; + + double CpuDgemmLookUp[8][8][9]; + double PhiDgemmLookUp[8][8][9]; + double PhiBWLookUp[8]; + double MicPciBandwidth[18]; + double MicScatterBW[24][24]; + +#ifdef OFFLOAD_PROFILE + double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING]; + double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; +#endif +} mdwin_t; + +int_t +get_max_buffer_size (); + +double get_acc_async_cost(); + +double estimate_acc_time(int m, int n , int k); + +double estimate_acc_gemm_time(int m, int n , int k); + +double estimate_acc_scatter_time(int m, int n , int k); + +double estimate_cpu_time(int m, int n , int k); + +double acc_data_send_time(size_t sz); + +void LookUpTableInit(int my_rank); + + +int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t 
num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi); +int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info, + int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi ); \ No newline at end of file diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c new file mode 100644 index 00000000..ff0034fc --- /dev/null +++ b/SRC/communication_aux.c @@ -0,0 +1,236 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines to support communication in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include "superlu_defs.h" +#if 0 +#include "sec_structs.h" +#include "communication_aux.h" +#include "compiler.h" +#endif + + +int_t Wait_LSend +/*wait till broadcast of L finished*/ +(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *send_req, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t lk = LBj (k, grid); + int_t mycol = MYCOL (iam, grid); + MPI_Status status; + for (int_t pj = 0; pj < Pc; ++pj) + { + /* Wait for Isend to complete before using lsub/lusup. */ + if (ToSendR[lk][pj] != EMPTY && pj != mycol) + { + MPI_Wait (&send_req[pj], &status); + MPI_Wait (&send_req[pj + Pc], &status); + } + } + SCT->Wait_LSend_tl += ( SuperLU_timer_() - t1); + return 0; +} + + +int_t Wait_USend +/*wait till broadcast of U panels finished*/ +( MPI_Request *send_req, gridinfo_t *grid, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Status status; + for (int_t pi = 0; pi < Pr; ++pi) + { + if (pi != myrow) + { + MPI_Wait (&send_req[pi], &status); + MPI_Wait (&send_req[pi + Pr], &status); + } + } + SCT->Wait_USend_tl += (double) (SuperLU_timer_() - t1); + return 0; +} + + +int_t Check_LRecv +/*checks if diagnoal blocks have been received*/ + +( MPI_Request* recv_req, int* msgcnt ) +{ + int flag0, flag1; + MPI_Status status; + + flag0 = flag1 = 0; + if (recv_req[0] != MPI_REQUEST_NULL) + { + MPI_Test (&recv_req[0], &flag0, &status); + if (flag0) + { + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + } + else + flag0 = 1; + if (recv_req[1] != MPI_REQUEST_NULL) + { + MPI_Test (&recv_req[1], &flag1, &status); + if (flag1) + { + MPI_Get_count (&status, mpi_int_t, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + } + else + flag1 = 1; + + return flag1 && flag0; +} + + +int_t Wait_UDiagBlockSend(MPI_Request *U_diag_blk_send_req, + gridinfo_t * grid, SCT_t* SCT) +{ + + double t1 = 
SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Status status; + for (int_t pr = 0; pr < Pr; ++pr) + { + if (pr != myrow) + { + MPI_Wait (U_diag_blk_send_req + pr, &status); + } + } + SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + +int_t Wait_LDiagBlockSend(MPI_Request *L_diag_blk_send_req, + gridinfo_t * grid, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t mycol = MYCOL (iam, grid); + MPI_Status status; + for (int_t pc = 0; pc < Pc; ++pc) + { + if (pc != mycol) + { + MPI_Wait (L_diag_blk_send_req + pc, &status); + } + } + SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + + +int_t Wait_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + MPI_Wait(request, &status); + SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return 0; +} + +int_t Test_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + int flag; + MPI_Test(request,&flag, &status); + SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return flag; + +} + +int_t Wait_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + MPI_Wait(request, &status); + SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return 0; + +} + +int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) +{ + double t1 = SuperLU_timer_(); + MPI_Status status; + int flag; + MPI_Test(request, &flag, &status); + SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1); + return flag; +} + +/* + * The following are from trfCommWrapper.c. 
+ */ +int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, + MPI_Request *L_diag_blk_send_req, + gridinfo_t *grid, SCT_t *SCT) +{ + // Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + // LocalLU_t *Llu = LUstruct->Llu; + // int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + if (iam == pkk) + { + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + Wait_LDiagBlockSend(L_diag_blk_send_req, grid, SCT); + } + + return 0; +} + + + +int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, + MPI_Request * L_diag_blk_recv_req, + gridinfo_t *grid) +{ + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && factored_U[k] == 0 && iam != pkk) + { + factored_U[k] = 1; + MPI_Status status; + MPI_Wait(L_diag_blk_recv_req, &status); + } + return 0; +} + diff --git a/SRC/dHWPM_CombBLAS.hpp b/SRC/dHWPM_CombBLAS.hpp index 53370d39..9d8b4bc1 100644 --- a/SRC/dHWPM_CombBLAS.hpp +++ b/SRC/dHWPM_CombBLAS.hpp @@ -73,7 +73,7 @@ dGetHWPM(SuperMatrix *A, gridinfo_t *grid, dScalePermstruct_t *ScalePermstruct) { printf("HWPM only supports square process grid. 
Retuning without a permutation.\n"); } - combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc; + combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc(grid->comm); std::vector< std::vector < std::tuple > > data(procs); /* ------------------------------------------------------------ diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c index bb842852..cdf0dc21 100644 --- a/SRC/dbinary_io.c +++ b/SRC/dbinary_io.c @@ -18,6 +18,7 @@ dread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -27,7 +28,7 @@ dwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("matrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); @@ -37,4 +38,5 @@ dwrite_binary(int_t n, int_t nnz, printf("dump binary file ... # of double fwrite: %d\n", nnz_written); assert(nnz_written==nnz); fclose(fp1); + return 0; } diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c new file mode 100644 index 00000000..ef9d6da6 --- /dev/null +++ b/SRC/dcommunication_aux.c @@ -0,0 +1,503 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Communication routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_ddefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t dIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t dBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t dIBcast_UPanel
+/*asynchronously broadcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_DOUBLE,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts U panel to my process row */
+int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     double* uval, gridinfo_t *grid,
+		    int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_DOUBLE, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dIrecv_LPanel
+/*it places Irecv calls for the L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, double* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, dLocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t dIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, double* Uval_buf, dLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column. */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+               SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ ,
+               scp->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
+               SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+               scp->comm, &recv_req_u[1]);
+
+    return 0;
+}
+
+int_t dWait_URecv
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dWait_LRecv
+/*waits till L blocks have been received*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size, /*number of elements to be broadcasted*/
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (ublk_ptr, size, MPI_DOUBLE, pr,
+                       SLU_MPI_TAG (4, k0) /* tag */ ,
+                       comm, U_diag_blk_send_req + pr);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                      int_t size, /*number of elements to be broadcasted*/
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Blocking receive of the U diagonal block of step k0 from process
+       row 'src' on the column communicator; the elapsed time is
+       accumulated into SCT->Recv_UDiagBlock_tl. */
+    double t_start = SuperLU_timer_();
+    MPI_Status recv_status;
+
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+    MPI_Recv (ublk_ptr, size, MPI_DOUBLE, src, SLU_MPI_TAG (4, k0),
+              (grid->cscp).comm, &recv_status);
+
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t_start;
+    return 0;
+}
+
+
+int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, dLocalLU_t *Llu)
+/*Copies src matrix into dest matrix*/
+{
+    /* Pack the nsupc x nsupc diagonal block of local L column k
+       (stored with leading dimension nsupr) into the dense buffer
+       Dest (leading dimension nsupc). */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    double *src = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    /* Leading dimension of the supernode's value storage; zero when
+       the local L column is empty. */
+    int_t nsupr = Llu->Lrowind_bc_ptr[lk] ? Llu->Lrowind_bc_ptr[lk][1] : 0;
+
+    /* Column-by-column copy of the leading nsupc rows. */
+    for (int_t col = 0; col < nsupc; ++col) {
+	memcpy( &Dest[col * nsupc], &src[col * nsupr], nsupc * sizeof(double) );
+    }
+    
+    return 0;
+}
+
+int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    /* Post a non-blocking send of the L diagonal block to every other
+       process column in this process row; one request per destination
+       column is recorded in L_diag_blk_send_req[]. */
+    MPI_Comm row_comm = (grid->rscp).comm;   /* row communicator */
+    int_t my_col = MYCOL (grid->iam, grid);
+    int_t num_cols = grid->npcol;
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < num_cols; ++dest)
+    {
+        if (dest == my_col) continue;        /* never send to self */
+        /* tag = ((k0<<2)+2) % tag_ub;        */
+        /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+        MPI_Isend (lblk_ptr, size, MPI_DOUBLE, dest,
+                   SLU_MPI_TAG (5, k0),
+                   row_comm, &L_diag_blk_send_req[dest]);
+    }
+
+    return 0;
+}
+
+
+int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Post a non-blocking receive for the U diagonal block of step k0
+     * from process row 'src' on the column communicator.  The request
+     * handle is returned via U_diag_blk_recv_req; elapsed time is
+     * accumulated into SCT->Recv_UDiagBlock_tl. */
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, MPI_DOUBLE, src,
+               		   SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    /* Bug fix: the original test (err == MPI_ERR_COUNT) only caught a
+     * single error class and silently ignored every other MPI failure;
+     * any code other than MPI_SUCCESS indicates an error. */
+    if (err != MPI_SUCCESS)
+    {
+        printf("Error in IRecv_UDiagBlock\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    /* Post a non-blocking receive for the L diagonal block of step k0
+     * from process column 'src' on the row communicator.  The request
+     * handle is returned via L_diag_blk_recv_req. */
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, MPI_DOUBLE, src,
+                   SLU_MPI_TAG (5, k0),
+                   comm, L_diag_blk_recv_req);
+    /* Bug fix: the original test (err == MPI_ERR_COUNT) only caught a
+     * single error class; any code other than MPI_SUCCESS is an error.
+     * (Also fixed the "lDiagBlock" typo in the message.) */
+    if (err != MPI_SUCCESS)
+    {
+        printf("Error in IRecv_LDiagBlock\n");
+    }
+    /* NOTE(review): this accumulates into Recv_UDiagBlock_tl, not an
+     * L-specific counter — possibly a deliberate aggregate, possibly a
+     * copy-paste slip; confirm the SCT_t fields before changing. */
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    /* Non-blocking broadcast of the U diagonal block of supernode k
+       along the process column, rooted at the row that owns k. */
+    int_t root_row = PROW (k, grid);
+
+    MPI_Ibcast (ublk_ptr, size, MPI_DOUBLE, root_row,
+                (grid->cscp).comm, L_diag_blk_ibcast_req);
+
+    return 0;
+}
+
+int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    /* Non-blocking broadcast of the L diagonal block of supernode k
+       along the process row, rooted at the column that owns k. */
+    int_t root_col = PCOL (k, grid);
+
+    MPI_Ibcast (lblk_ptr, size, MPI_DOUBLE, root_col,
+                (grid->rscp).comm, U_diag_blk_ibcast_req);
+
+    return 0;
+}
+
+#endif 
+
+int_t dUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    /* Only act if the non-blocking receive of the diagonal block of
+       supernode k was actually posted. */
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /*factor the L panel*/
+        /* Only processes in k's column (excluding the diagonal owner
+           pkk) that have not yet factored this L panel need to wait. */
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            /* Leading dimension of the local L block; 0 when empty.
+               NOTE(review): nsupr is computed but not used below —
+               presumably kept for symmetry with the factoring path. */
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /*wait for communication to finish*/
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            /* Busy-poll until the receive completes. */
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/dgather.c b/SRC/dgather.c
new file mode 100644
index 00000000..0c618e42
--- /dev/null
+++ b/SRC/dgather.c
@@ -0,0 +1,398 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include <stdio.h>
+#include "superlu_ddefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t dprintMatrix(char*s, int n, int m, double* A, int LDA)
+{
+    /* Debug helper: print the n-by-m column-major matrix A
+       (leading dimension LDA), preceded by the label s. */
+    printf("%s\n", s );
+    for (int i = 0; i < n; i++)
+    {
+        for (int j = 0; j < m; j++)
+            printf("%g ", A[i + j * LDA]);
+        printf("\n");
+    }
+    return 0;
+}
+
+/* Gather the L panel of supernode k into the contiguous lookahead and
+ * remaining buffers held in HyP.  NOTE(review): this span of the patch
+ * was garbled by text extraction (the '<' comparisons and the
+ * dRgather_L signature were stripped); the signature and locals below
+ * are reconstructed — verify against upstream SuperLU_DIST dgather.c. */
+void dRgather_L( int_t k, int_t *lsub, double *lusup,
+                 gEtreeInfo_t* gEtreeInfo, Glu_persist_t *Glu_persist,
+                 gridinfo_t *grid, HyP_t *HyP,
+                 int_t *myIperm, int_t *iperm_c_supno )
+{
+    int_t temp_nbrow;   /* rows in the current row block */
+    int_t cum_nrow = 0; /* running row offset within the panel */
+    int_t ib;           /* block row index of L(ib,k) */
+    int_t *xsup = Glu_persist->xsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    dgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    dgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // dprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // dprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+/* Gather the U panel of supernode k into contiguous buffers, splitting
+ * the column blocks between the host list (Ublock_info) and the
+ * offload list (Ublock_info_Phi) according to elimination order,
+ * lookahead window, and etree parent relationships. */
+void dRgather_U( int_t k, int_t jj0, int_t *usub,	double *uval,
+                 double *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;     /* NOTE(review): full/full_Phi are computed but never read here */
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;
+    HyP->ldu = 0;
+
+    /* Classify every column block U(k,j), j >= jj0, as host or Phi and
+       record its metadata; also track the maximum segment size (ldu). */
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        /* Count the nonempty column segments of this block. */
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /*here goes the condition wether jb block exists on Phi or not*/
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first colum on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        /* Keep jb on the host when it is k's etree parent and k is its
+           last unfinished child. */
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            /* Host-side block list. */
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            /* Offload ("Phi") block list, sorted later by elim. order. */
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on ncols.
+       NOTE(review): element [0] is touched even when the corresponding
+       count is zero — harmless for the pointer math below, but confirm
+       the buffers are always allocated. */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    /* The Phi blocks are packed at the start of bigU; host blocks
+       follow them. */
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    dgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    dgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* dRgather_U */
diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h
new file mode 100644
index 00000000..d439bd2d
--- /dev/null
+++ b/SRC/dlustruct_gpu.h
@@ -0,0 +1,238 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_ddefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" + +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + double *bigV; + double *bigU; + double *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + double *Remain_L_buff; /* on GPU */ + double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} dSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + double *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + double *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + dSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + double *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* 
perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} dLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + dLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} dsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int dinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + dsluGPU_t *sluGPU, + gridinfo_t *grid, + dLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int dreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + dLUstruct_t *LUstruct); + +extern int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT); +extern int dsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, dsluGPU_t *sluGPU); +extern int dsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, double* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, dsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, + 
dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void dCopyLUToGPU3D (int* isNodeInMyGrid, dLocalLU_t *A_host, + dsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT); +extern int dfree_LUstruct_gpu (dLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(dsluGPU_t *sluGPU); + +extern void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda ); + +/*to print out various statistics*/ +void dprintGPUStats(dLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c index 9d047273..5b722ac9 100644 --- a/SRC/dmemory_dist.c +++ b/SRC/dmemory_dist.c @@ -170,8 +170,6 @@ double *doubleCalloc_dist(int_t n) return (buf); } -#if 0 //////////////// Sherry - /*************************************** * The following are from 3D code. ***************************************/ @@ -286,4 +284,3 @@ void d3D_printMemUse( trf3Dpartition_t* trf3Dpartition, dLUstruct_t *LUstruct, } } -#endif diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c new file mode 100644 index 00000000..97dac6a6 --- /dev/null +++ b/SRC/dnrformat_loc3d.c @@ -0,0 +1,575 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
+{
+    /* Copy an n-by-m column-major block from Src (leading dimension
+       ldsrc) into Dst (leading dimension lddst), one column at a time. */
+    for (int col = 0; col < m; ++col)
+    {
+        double *dst_col = Dst + lddst * col;
+        double *src_col = Src + ldsrc * col;
+        for (int row = 0; row < n; ++row)
+            dst_col[row] = src_col[row];
+    }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void dGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ double *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit  /* NOTE(review): shadows the function-scope declarations */
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	/* Gather per-process nnz and row counts along the Z dimension
+	   (zscp) onto layer 0. */
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	/* Build displacement (prefix-sum) arrays for the Gatherv calls. */
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	/* NOTE(review): on non-root ranks the A2d buffers are not
+	   allocated; MPI ignores recvbuf except at root, so this is the
+	   standard Gatherv idiom. */
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    /* Recompute fst_row as the prefix sum of m_loc over
+		       the ranks of the 2D layer-0 grid. */
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	/* Publish the gathered matrix and the count/displacement arrays
+	   in the returned A3d structure for reuse on later calls. */
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;  /* NOTE(review): shadows outer A2d */
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	/* Re-gather the (possibly modified) values and structure using
+	   the counts computed on the first call. */
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;; /* NOTE(review): stray second ';' — harmless empty statement */
+	
+	/* Btmp <- compact(B), compacting B */
+	double *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	/* B1/B2d are allocated (and significant) only on the root of
+	   zscp; on other ranks the Gatherv ignores its recvbuf. */
+	double *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+		    b_counts_int, b_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* dGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    double *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    double *Btmp; // on 3D grid
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_DOUBLE,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* dScatter_B3d */
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
index 53b02ab0..8be6ba3c 100644
--- a/SRC/dreadMM.c
+++ b/SRC/dreadMM.c
@@ -61,7 +61,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
@@ -108,7 +108,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 3/ Read n and nnz */
 #ifdef _LONGINT
-    sscanf(line, "%ld%ld%ld",m, n, nonz);
+    sscanf(line, "%lld%lld%lld", m, n, nonz);
 #else
     sscanf(line, "%d%d%d",m, n, nonz);
 #endif
diff --git a/SRC/dreadtriple.c b/SRC/dreadtriple.c
index 8053e69a..523e2596 100644
--- a/SRC/dreadtriple.c
+++ b/SRC/dreadtriple.c
@@ -47,7 +47,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
 #ifdef _LONGINT
-    fscanf(fp, "%ld%ld%ld", m, n, nonz);
+    fscanf(fp, "%lld%lld%lld", m, n, nonz);
 #else
     fscanf(fp, "%d%d%d", m, n, nonz);
 #endif
@@ -77,7 +77,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 
 #ifdef _LONGINT
-        fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+        fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
 #else // int 
         fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
 #endif
@@ -86,8 +86,9 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
-	    } else
+	    } else {
 		printf("triplet file: row/col indices are one-based.\n");
+     	    }
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c
index 09004af4..fce23583 100644
--- a/SRC/dreadtriple_noheader.c
+++ b/SRC/dreadtriple_noheader.c
@@ -49,9 +49,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     nz = *n = 0;
 
 #ifdef _LONGINT
-    ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali);
+    ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali);
 #else  // int
-    ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali);
+    ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
 #endif
 
     while (ret_val != EOF) {
@@ -62,9 +62,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	++nz;
 
 #ifdef _LONGINT
-        ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali);
+        ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali);
 #else  // int
-        ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali);
+        ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali);
 #endif
     }
     
@@ -105,7 +105,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* Read into the triplet array from a file */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]);
+	fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]);
 #else // int32
 	fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]);
 #endif
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
index c167e19e..dbddd550 100644
--- a/SRC/dscatter.c
+++ b/SRC/dscatter.c
@@ -306,7 +306,7 @@ gemm_division_cpu_gpu(
 )
 {
     int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
@@ -433,7 +433,7 @@ gemm_division_new (int * num_streams_used,   /*number of streams that will be us
     )
 {
     int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
new file mode 100644
index 00000000..53af7944
--- /dev/null
+++ b/SRC/dscatter3d.c
@@ -0,0 +1,625 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, double *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+dblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int ldl,
+                    double *U_mat, int ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    double alpha = 1.0, beta = 0.0;
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        dscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple threads updating the same block*/
+void
+dblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;  double beta = 0.0;
+
+    /* calling DGEMM */
+    superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        dscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 dLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dgemm_scatterTopLeft */
+
+int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  dLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dblock_gemm_scatterTopRight */
+
+int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    dLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    //printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomLeft */
+
+/* Schur-complement update for one (L,U) block pair using the _Phi data
+ * (HyP->Ublock_info_Phi, HyP->bigU_Phi, HyP->ldu_Phi) -- presumably the
+ * U panel staged for the accelerator/offload path; confirm against the
+ * HyP setup code.  Otherwise identical in structure to
+ * dblock_gemm_scatterBottomLeft: a timing wrapper around
+ * dblock_gemm_scatter().  Returns 0 unconditionally.
+ */
+int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     dLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    /* Unpack the L/U index and value arrays from the local LU structure. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    /* Per-thread timer slots are spaced CACHE_LINE_SIZE apart, presumably
+       to avoid false sharing between threads -- TODO confirm. */
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+/* Dead code, disabled at compile time.  Per the note above, this is the
+ * same as dscatter_l in dscatter.c: it scatters the dense GEMM result
+ * 'tempv' (temp_nbrow x nsupc, leading dimension nbrow) into the
+ * destination L block L(ib, ljb), subtracting each nonzero column
+ * segment.  Kept here for reference only. */
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    /* Linear search of this column's block list for dest block ib. */
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    /* Map each destination row (relative to fnz) to its storage slot. */
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        /* Only columns with a nonzero segment carry data in tempv. */
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+/* Scatter the dense GEMM result 'tempv' (temp_nbrow x nsupc, leading
+ * dimension nbrow) into the destination U block U(ib, jb), subtracting
+ * each nonzero column segment.  Columns with klst - usub[iukp+jj] == 0
+ * carry no data in tempv and are skipped.  'indirect' caches the global
+ * row indices of the temp block so the inner loop avoids re-reading lsub. */
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);   /* first row of the next block */
+    int_t lib = LBi (ib, grid);        /* local block-row index of ib */
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    /* Cache the global row index of each row of the temp block. */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];   /* first nonzero row of column jj in the dest block */
+        /* Shift the base so ucol[] can be indexed by global row number:
+           the stored entries of this column are rows fnz..ilst-1. */
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        /* Net advance of ilst - fnz = number of stored entries in this
+           column (undoes the fnz shift above). */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
new file mode 100644
index 00000000..9b4cb79f
--- /dev/null
+++ b/SRC/dsuperlu_blas.c
@@ -0,0 +1,123 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_ddefs.h"
+
+#ifdef _CRAY
+/* Pre-built Fortran character descriptors for the common BLAS mode
+   arguments "N", "L", "N", "U", used by the Cray BLAS interface. */
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+/* GEMM wrapper: C := alpha*op(A)*op(B) + beta*C (double precision).
+ * Dispatches to Cray SGEMM (Fortran character descriptors), a vendor
+ * dgemm_ (which takes hidden character-length arguments -- the trailing
+ * 1, 1), or the reference dgemm_.  Scalars are passed by address to
+ * match the Fortran calling convention.  Returns 0 (or the callee's
+ * return value on the Cray/reference paths). */
+int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a,
+                  int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    dgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return dgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+/* TRSM wrapper: triangular solve with multiple right-hand sides,
+ * op(A)*X = alpha*B or X*op(A) = alpha*B, overwriting B with X.
+ * The vendor path passes hidden character-length arguments
+ * (trailing 1, 1, 1, 1).  Returns 0 (or dtrsm_'s return value on the
+ * reference path). */
+int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    dtrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return dtrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+/* GER wrapper: rank-1 update A := alpha*x*y**T + A.
+ * Uses Cray SGER when _CRAY is defined, otherwise dger_.  Returns 0. */
+int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    dger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+/* SCAL wrapper: x := alpha*x over n elements with stride incx. Returns 0. */
+int superlu_dscal(const int n, const double alpha, double *x, const int incx)
+{
+    dscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+/* AXPY wrapper: y := alpha*x + y over n elements with strides incx/incy.
+ * Returns 0. */
+int superlu_daxpy(const int n, const double alpha,
+    const double *x, const int incx, double *y, const int incy)
+{
+    daxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+/* GEMV wrapper: y := alpha*op(A)*x + beta*y.  The vendor path passes the
+ * hidden character-length argument for 'trans' (trailing 1).  Returns 0. */
+int superlu_dgemv(const char *trans, const int m,
+                  const int n, const double alpha, const double *a,
+                  const int lda, const double *x, const int incx,
+                  const double beta, double *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+/* TRSV wrapper: triangular solve op(A)*x = b, overwriting x (b on entry).
+ * Dispatches to Cray STRSV, vendor dtrsv_ (hidden character-length
+ * arguments: trailing 1, 1, 1), or the reference dtrsv_.  Returns 0. */
+int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                  int n, double *a, int lda, double *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
new file mode 100644
index 00000000..51a5b7b4
--- /dev/null
+++ b/SRC/dsuperlu_gpu.cu
@@ -0,0 +1,1780 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "superlu_defs.h" + +#undef Reduce + +//#include + +#include "dlustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + double *nzval, int_t ldv, + double *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void ddevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + double *nzval, int ldv, + const double *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + double * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + double * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + dLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + double *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + double *LnzvalVec = A_gpu->LnzvalVec; + double *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int THREAD_BLOCK_SIZE =ldt; + + int* 
pfxStorage = (int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + double *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + double *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if (thread_id < 
blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + double *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + ddevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + 
nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + double * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("dSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(double), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(double), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) + + bigu_send_size * sizeof(double) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + double alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = 
Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. 
*/ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + gpublasDgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end dSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(gpuDeviceProp devProp) +{ + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) +{ + /* Free the L data structure on GPU */ + checkGPU(gpuFree(A_gpu->LrowindVec)); + checkGPU(gpuFree(A_gpu->LrowindPtr)); + + checkGPU(gpuFree(A_gpu->LnzvalVec)); + checkGPU(gpuFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + + /*freeing the pinned memory*/ + int_t streamId = 0; + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkGPU(gpuFreeHost(A_gpu->acc_L_buff)); + checkGPU(gpuFreeHost(A_gpu->acc_U_buff)); + checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + 
checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE + SUPERLU_FREE(A_gpu->GemmStart); + SUPERLU_FREE(A_gpu->GemmEnd); + SUPERLU_FREE(A_gpu->ScatterEnd); + SUPERLU_FREE(A_gpu->ePCIeH2D); + SUPERLU_FREE(A_gpu->ePCIeD2H_Start); + SUPERLU_FREE(A_gpu->ePCIeD2H_End); + + /* Free the U data structure on GPU */ + checkGPU(gpuFree(A_gpu->UrowindVec)); + checkGPU(gpuFree(A_gpu->UrowindPtr)); + + //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated + + checkGPU(gpuFree(A_gpu->UnzvalVec)); + checkGPU(gpuFree(A_gpu->UnzvalPtr)); + + checkGPU(gpuFree(A_gpu->grid)); + + /* Free the Schur complement structure on GPU */ + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkGPU(gpuFree(A_gpu->indirect)); + // checkGPU(gpuFree(A_gpu->indirect2)); + checkGPU(gpuFree(A_gpu->xsup)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].usub)); + + checkGPU(gpuFree(A_gpu->local_l_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->jib_lookupVec)); + checkGPU(gpuFree(A_gpu->jib_lookupPtr)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->ijb_lookupVec)); + checkGPU(gpuFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) +{ + double *cPtr = (double *) malloc(sizeof(double) * lda * n); + checkGPU(gpuMemcpy( cPtr, dA, + lda * n * sizeof(double), gpuMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" 
); + } + free(cPtr); +} + +void dprintGPUStats(dLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + +/* Initialize the GPU side of the data structure. 
*/ +int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, // LU structures on GPU, see dlustruct_gpu.h + dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + (gpuDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nGPUStreams = getnGPUStreams(); + + int SCATTER_THREAD_BLOCK_SIZE = ldt; + if(getenv("SCATTER_THREAD_BLOCK_SIZE")) + { + int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE")); + if(stbs>=ldt) + { + SCATTER_THREAD_BLOCK_SIZE = stbs; + } + + } + + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams); + fflush(stdout); + printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE); + + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + gpuStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + gpuStreamCreate(&(sluGPU->funCallStreams[streamId])); + gpublasCreate(&(sluGPU->gpublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (dLUstruct_gpu_t *) malloc (sizeof(dLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. 
*/ + dCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end dinitSluGPU3D_t */ + + +int dinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + dsluGPU_t *sluGPU, gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && 
Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* dinitD2Hreduce */ + +int dreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + dLUstruct_t *LUstruct +) +{ + dLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + gpuStream_t CopyStream; + dLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = 
d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + double *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + double alpha = 1; + superlu_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + double *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + double alpha = 1; + superlu_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* dreduceGPUlu */ + + +int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int dsendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + gpuStream_t CopyStream = sluGPU->CopyStream;; + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(double), gpuMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, 
&A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(double), gpuMemcpyDeviceToHost, CopyStream ) ); + gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(double) + l_copy_len * sizeof(double); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int dsendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + double* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + dsluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] dsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in dsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(double)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(double)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(dsluGPU_t *sluGPU) +{ + return 0; +} +#endif + +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling dfree_LUsstruct_gpu(). 
*/ +void dCopyLUToGPU3D ( + int* isNodeInMyGrid, + dLocalLU_t *A_host, /* distributed LU structure on host */ + dsluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (double *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + 
A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(double))) ; + A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(double); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + 
+ /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + 
checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + 
+ checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(double) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + 
int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(double) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(double) * 1e-9, + u_val_len * sizeof(double) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(double))); + A_gpu->LnzvalVec = (double *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(double))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( 
&tmp_ptr, u_val_len * sizeof(double))); + A_gpu->UnzvalVec = (double *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(double))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(dLUstruct_gpu_t))) ; + *dA_gpu = (dLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(dLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end dCopyLUToGPU3D */ + + + +int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nGPUStreams; 
streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + dinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* dreduceAllAncestors3d_GPU */ + + +void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/dsuperlu_gpu.hip.cpp b/SRC/dsuperlu_gpu.hip.cpp new file mode 100644 index 00000000..b71efe5a --- /dev/null +++ b/SRC/dsuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "dsuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c new file mode 100644 index 00000000..b7fbdbff --- /dev/null +++ b/SRC/dtreeFactorization.c @@ -0,0 +1,763 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_ddefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t dLluBufInit(dLUValSubBuf_t* LUvsb, dLUstruct_t *LUstruct) /* allocate the four L/U panel exchange buffers, sized by Llu->bufmax[0..3] */
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]); L panel index buffer
+    LUvsb->Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]); L panel value buffer
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]); U panel index buffer
+    LUvsb->Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]); U panel value buffer
+    return 0; /* NOTE(review): the allocation results are not NULL-checked here */
+}
+
+ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) /* allocate one diagonal-factor buffer pair per leaf node; NULL if mxLeafNode == 0 */
+{
+    ddiagFactBufs_t** dFBufs = NULL; /* fix: was uninitialized and returned as-is when mxLeafNode == 0 */
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (ddiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(ddiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* allocate and initialize one ldt-by-ldt U/L diagonal-factor buffer pair */
+        dFBufs[i] = (ddiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(ddiagFactBufs_t));
+        assert(dFBufs[i]);
+        dinitDiagFactBufs(ldt, dFBufs[i]);
+
+    }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/
+
+    return dFBufs; /* NULL when mxLeafNode == 0, so callers never see an indeterminate pointer */
+}
+
+// sherry added
+int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs) /* release everything allocated by dinitDiagFactBufsArr() */
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+	SUPERLU_FREE(dFBufs[i]->BlockUFactor); /* allocated in dinitDiagFactBufs() */
+	SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+	SUPERLU_FREE(dFBufs[i]);
+    }
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs); /* the outer array exists only when mxLeafNode > 0 */
+
+    return 0;
+}
+
+dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct) /* one L/U exchange-buffer set per look-ahead level */
+{
+    dLUValSubBuf_t** LUvsbs = (dLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(dLUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* allocate the struct, then size its four buffers from Llu->bufmax[] */
+        LUvsbs[i] = (dLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
+        dLluBufInit(LUvsbs[i], LUstruct);
+    } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
+
+    return LUvsbs;
+}
+
+// sherry added
+int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs) /* release everything allocated by dLluBufInitArr() */
+{
+    for (int_t i = 0; i < numLA; ++i) {
+	SUPERLU_FREE(LUvsbs[i]->Lsub_buf); /* the four buffers allocated in dLluBufInit() */
+	SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+	SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+	SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, /* allocate Schur-complement-update work buffers; grid arg currently unused here */
+                  dscuBufs_t* scuBufs,
+                  dLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    scuBufs->bigV = dgetBigV(ldt, num_threads); /* scratch sized from ldt and num_threads (see dgetBigV) */
+    scuBufs->bigU = dgetBigU(nsupers, grid, LUstruct); /* packed-U buffer sized by dgetBigU */
+    return 0;
+}
+
+// sherry added
+int dfreeScuBufs(dscuBufs_t* scuBufs) /* release the buffers allocated by dinitScuBufs() */
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
+
+int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf) /* allocate the two ldt*ldt dense diagonal-block factor buffers */
+{
+    dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); U diagonal-block factor
+    dFBuf->BlockLFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); L diagonal-block factor
+    return 0;
+}
+
+int_t ddenseTreeFactor( /* sequential 2D subtree factorization: factor the nnodes supernodes in perm_c_supno order */
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    dLUValSubBuf_t* LUvsb,
+    ddiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization, then Ibcast of the diagonal block */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	dDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	dIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	dUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	dIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	dWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	dWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT); /* both panels of step k must have arrived before the Schur update */
+#endif
+        double tsch = SuperLU_timer_(); /* start timing the Schur-complement phase */
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= dSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            double* bigV = scuBufs->bigV;
+            double* bigU = scuBufs->bigU;
+
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij) /* flattened loop over all (U-block, L-block) pairs */
+            {
+                /* per-iteration aliases; ij decodes to the (ub, lb) block pair below */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                double** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                double *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                double *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                dblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT); /* drain the diagonal-block sends before reusing the buffers next step */
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* ddenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ *
+ * Factors the supernodes of one elimination subtree (sforest) on the 2D
+ * process grid using an asynchronous look-ahead pipeline:
+ *   1. the diagonal blocks of all first-level (leaf) nodes are factored
+ *      and Ibcast'ed up front;
+ *   2. each topological level then performs panel triangular solves,
+ *      posts non-blocking L/U panel broadcasts, and runs the
+ *      Schur-complement update, keeping up to numLA look-ahead
+ *      broadcasts in flight (communication buffers are cycled via
+ *      k0 % numLA).
+ *
+ * Returns 0 on success, 1 if the forest is empty (nnodes < 1).
+ * *info is cleared on entry; a numerical problem during diagonal
+ * factorization is presumably reported through *info by
+ * dDiagFactIBCast -- confirm against that routine.
+ */
+int_t dsparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    dscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter dsparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    /* Phase 0: factor and Ibcast the diagonal blocks of every node in the
+       first (leaf) topological level, before entering the pipeline below. */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    /* Pipelined traversal: one iteration per level of the elimination tree. */
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        /* Diagonal factorization of any node in this level not already
+           factored ahead of time (leaf phase or parent look-ahead below). */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        /* Panel triangular solves: compute L(:,k) and U(k,:) for each node
+           of this level once its diagonal block is available. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        /* Prime the pipeline: post the initial non-blocking L/U broadcasts
+           for the first numLA panels of this level.  NOTE(review): the
+           communication-buffer slot switches to k0 % numLA here (the earlier
+           loops use k0 - k_st), cycling the numLA look-ahead slots. */
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        /* Main pipeline loop: for each panel k of this level, wait for its
+           L/U broadcast, apply the Schur-complement update, then schedule
+           the next look-ahead broadcasts and cross-level work. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = dSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            double* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            /* Cross-level look-ahead: k was this parent's last unfactored
+               child, so its parent's diagonal block can be factored early. */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        /* NOTE(review): buffer slot is indexed relative to the
+                           start of the next level (k_end); this assumes
+                           k0_parent >= k_end -- verify for deep trees. */
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+            /* Remaining Schur-complement region (bottom-right blocks beyond
+               jj_cpu; jj_cpu is 0 here, so this covers all num_u_blks_Phi). */
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            /* Look-ahead panel solves for the NEXT topological level: if a
+               node's diagonal block has already arrived, do its triangular
+               solves now and broadcast the resulting panels early. */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    dLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    dUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit dsparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* dsparseTreeFactor_ASYNC */
diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c
new file mode 100644
index 00000000..e2b66c34
--- /dev/null
+++ b/SRC/dtreeFactorizationGPU.c
@@ -0,0 +1,758 @@
+
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "dlustruct_gpu.h" +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + 
int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + dsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + double *bigV = scuBufs->bigV; + double *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->RemainBlk * 
HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + dreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + dwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + dsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + dSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end dsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c new file mode 100644 index 00000000..30c98fc3 --- /dev/null +++ b/SRC/dtrfAux.c @@ -0,0 +1,757 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
+#include "superlu_ddefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure to assist HALO offload of Schur-complement.
+ *
+ * Arguments:
+ *   HyP - hybrid CPU/GPU scheduling structure; all of its buffers are
+ *         allocated here and the dirty-bit arrays are reset
+ *   Llu - local LU data; Llu->bufmax[1] bounds the L-panel value buffers
+ *   mcb - number of column blocks; sizes Ublock_info/Ublock_info_Phi
+ *         and Lblock_dirty_bit
+ *   mrb - number of row blocks; sizes lookAhead_info/Remain_info
+ *         and Ublock_dirty_bit
+ */
+void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+#if 0 /* 64-byte-aligned allocation variant, currently disabled */
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    /* -1 marks a block as "clean": not yet touched by an offloaded update. */
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    /* -1: no Schur-complement update has been offloaded yet.
+       (The original code assigned this twice; the redundant first
+       assignment before the allocations has been removed.) */
+    HyP->last_offload = -1;
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nGPUStreams = 0;
+} /* dInit_HyP */
+
+/* Initialize the 3D LU structure from a list of supernodal forests.
+ *
+ * Translates the forest interface (one sForest_t per node of the
+ * elimination-tree hierarchy) into the flat node-count / node-list
+ * arrays expected by dinit3DLUstruct, delegates to it, and then
+ * releases the temporary arrays.
+ */
+void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, dLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    /* A complete binary tree with maxLvl levels has 2^maxLvl - 1 forests. */
+    int_t numForests = (1 << maxLvl) - 1;
+    int_t* gNodeCount = INT_T_ALLOC (numForests);
+    int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*));
+
+    /* Flatten each (possibly absent) forest into count/list form. */
+    for (int i = 0; i < numForests; ++i)
+    {
+        if (sForests[i] != NULL)
+        {
+            gNodeCount[i] = sForests[i]->nNodes;
+            gNodeLists[i] = sForests[i]->nodeList;
+        }
+        else
+        {
+            gNodeCount[i] = 0;
+            gNodeLists[i] = NULL;
+        }
+    }
+
+    /* Hand off to the original (non-forest) initialization routine. */
+    dinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     gNodeCount, gNodeLists, LUstruct, grid3d);
+
+    /* Only the temporary arrays are freed; the node lists themselves
+       are owned by the sForests and must stay alive. */
+    SUPERLU_FREE(gNodeCount);  // sherry added
+    SUPERLU_FREE(gNodeLists);
+}
+
+/* Set up the Schur-complement update for panel k (CPU path).
+ *
+ * Locates the k-th L panel (locally owned when mycol == kcol, otherwise
+ * the received Lsub_buf/Lval_buf) and the k-th U panel (locally owned
+ * when myrow == krow, otherwise Usub_buf/Uval_buf), reorders the U
+ * blocks, gathers U into the dense buffer bigU, and fills the per-block
+ * metadata arrays Ublock_info and Remain_info consumed by the GEMM phase.
+ *
+ * Returns 0 when msgcnt indicates L(:,k) or U(k,:) is empty (nothing to
+ * update), nonzero otherwise.
+ */
+int_t dSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    double *bigU,
+    int_t* Lsub_buf,
+    double *Lval_buf,
+    int_t* Usub_buf,
+    double *Uval_buf,
+    gridinfo_t *grid,
+    dLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    double* uval;
+    int_t* lsub;
+    double* lusup;
+
+    /* Select the L panel: local storage if this process column owns
+       column k, otherwise the communication buffers. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    /* Select the U panel analogously. */
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+        /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval
+           stay uninitialized; presumably msgcnt is then 0 so the early
+           return below fires before they are read — TODO confirm. */
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            /* Without ISORT, perm_u stores (key, index) pairs for qsort. */
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        /* Scan the (reordered) U blocks: count nonzero columns per block
+           and track the tallest segment (ldu = leading dimension for the
+           gathered dense U). */
+        int_t ldu   = 0;
+        int_t full  = 1;   /* NOTE(review): computed but never read here */
+        int_t num_u_blks = 0;
+
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;   /* shadows the outer iukp on purpose */
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            /* Count columns with a nonzero segment (fstnz < klst). */
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        /* Prefix sums: full_u_cols = cumulative column count, StCol =
+           starting column of each block in the gathered buffer. */
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        /* Pack the sparse U panel into the dense ldu x full_u_cols bigU. */
+        dgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        /* Record every off-diagonal row block of L(:,k) in Remain_info. */
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        /* NOTE(review): lptr/luptr are reset but not used again here. */
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* dSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+int_t dSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    dscuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, dLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    double * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    double* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    double* uval;
+    int_t* lsub;
+    double* lusup;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    if (mycol == kcol)
+    {
+        /* L(:,k) resides locally in this process column; point into block-column storage */
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /*now each process does the Schur-complement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        dRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            dRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* dSchurComplementSetupGPU */
+
+
+double* dgetBigV(int_t ldt, int_t num_threads)
+{
+    double *bigV;
+    if (!(bigV = doubleMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for dgemm buffV");
+    return bigV;
+}
+
+double* dgetBigU(int_t nsupers, gridinfo_t *grid, dLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction of within A row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /*Buffer size is the max over the look-ahead window*/
+
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    double* bigU = doubleMalloc_dist(bigu_size);
+
+    return bigU;
+} /* dgetBigU */
+
+
+trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      dLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter dinitTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    dinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
+    dLluBufInit(LUvsb, LUstruct);
+
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* code */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* code */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* dinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit dDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* code */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /*initialize perm_u*/
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
new file mode 100644
index 00000000..c7b51a54
--- /dev/null
+++ b/SRC/dtrfCommWrapper.c
@@ -0,0 +1,548 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+int_t dDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     double *BlockUFactor,
+                     double *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     dLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    //xsup for supersize
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        dIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        dIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /*factorize the diagonal block and broadcast it to the
+     process row and process column*/
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Dgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is overwritten here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        dPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /*Isend U blocks to the process row*/
+        int_t nsupc = SuperSize(k);
+        dISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        dISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
+		      double* BlockUFactor,
+		      gridinfo_t *grid,
+		      dLUstruct_t *LUstruct)
+{
+    double alpha = 1.0;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /*factor the L panel*/
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        double *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        double* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        double *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        int_t l = nsupr - nsupc;
+
+        double* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* dLPanelTrSolve */
+
+int_t dLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    double* BlockUFactor,
+                    gridinfo_t *grid,
+                    dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+
+    dUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+}  /* dLPanelUpdate */
+
+#define BL  32
+
+int_t dUPanelTrSolve( int_t k,  
+                     double* BlockLFactor,
+                     double* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     dLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /*factor the U panel*/
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        double *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        double *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                double *tempv = bigV +  thread_id * ldt * ldt;
+                dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /*factor the U panel*/
+    if (iam == pkk)
+    {
+        /* code */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        double *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            double *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            double *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    double *tempv = bigV +  thread_id * ldt * ldt;
+                    dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* dUPanelTrSolve */
+
+int_t dUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    double* BlockLFactor,
+                    double* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    dLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+
+    dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
+    return 0;
+}
+
+int_t dIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    double* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    dLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* code */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        double* lusup = Lnzval_bc_ptr[lk];
+
+        dIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(double));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*place Irecv*/
+            dIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, double* Uval_buf,
+    			     gridinfo_t *grid, dLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /*send U panel to myrow*/
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        double* uval = Unzval_br_ptr[lk];
+        dIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* code */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(double));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            dIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t dWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /*wait for pending sends of the L panel to complete*/
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*force wait for I recv to complete*/
+            dWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t dWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            dWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index 1879594e..0fd6d9a2 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -14,10 +14,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include 
@@ -392,6 +392,7 @@ void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, double c)
 
     return;
 }
+/**** end utilities added for SUNDIALS ****/
 
 /*! \brief Allocate storage in ScalePermstruct */
 void dScalePermstructInit(const int_t m, const int_t n,
@@ -420,9 +421,65 @@ void dScalePermstructFree(dScalePermstruct_t *ScalePermstruct)
         SUPERLU_FREE(ScalePermstruct->R);
         SUPERLU_FREE(ScalePermstruct->C);
         break;
+      default: break;
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int dDeAllocGlu_3d(dLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int dDeAllocLlu_3d(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* dDeAllocLlu_3d */
+
 
 /**** Other utilities ****/
 void
@@ -431,8 +488,8 @@ dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx)
     int  i, j;
     for (j = 0; j < nrhs; ++j)
 	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx] = 1.0;/* + (double)(i+1.)/n;*/
-	    else x[i + j*ldx] = 1.0;
+	    if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n;
+	    else x[i + j*ldx] = 1.0 - (double)(i+1.)/n;
 	}
 }
 
@@ -559,7 +616,7 @@ void dPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 
 /*! \brief Sets all entries of matrix L to zero.
  */
-void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
+void dZeroLblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct)
 {
     double zero = 0.0;
     register int extra, gb, j, lb, nsupc, nsupr, ncb;
@@ -589,7 +646,7 @@ void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
             }
 	}
     }
-} /* dZeroLblocks */
+} /* end dZeroLblocks */
 
 
 /*! \brief Dump the factored matrix L using matlab triple-let format
@@ -598,8 +655,8 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, dLocalLU_t *Llu)
 {
     register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-	int_t nnzL, n,nmax;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
     int_t *xsup = Glu_persist->xsup;
     int_t *index;
     double *nzval;
@@ -652,7 +709,7 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		}
 
 	if(grid->iam==0){
-		fprintf(fp, "%d %d %d\n", n,n,nnzL);
+		fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
 	}
 
      ncb = nsupers / grid->npcol;
@@ -1185,7 +1242,37 @@ void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
 	}
     }
-} /* DPRINTUBLOCKS */
+} /* end dPrintUblocks */
+
+/*! \brief Sets all entries of matrix U to zero.
+ */
+void dZeroUblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct)
+{
+    double zero = 0.0;
+    register int i, extra, lb, len, nrb;
+    register int myrow, r;
+    dLocalLU_t *Llu = LUstruct->Llu;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    double *nzval;
+    int nsupers = Glu_persist->supno[n-1] + 1;
+
+    nrb = nsupers / grid->nprow;
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb;
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    len = index[1];  // number of entries in nzval[];
+	    for (i = 0; i < len; ++i) {
+	        nzval[i] = zero;
+	    }
+	}
+    }
+} /* end dZeroUblocks */
 
 int
 dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm,
diff --git a/SRC/get_perm_c_parmetis.c b/SRC/get_perm_c_parmetis.c
index 6381f4e5..b08c35ab 100644
--- a/SRC/get_perm_c_parmetis.c
+++ b/SRC/get_perm_c_parmetis.c
@@ -122,7 +122,7 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   int_t m, n, bnz, i, j;
   int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
   int_t *b_rowptr, *b_colind;
-  int_t *dist_order;
+  int_t *dist_order = NULL;
   int  *recvcnts, *displs;
   /* first row index on each processor when the matrix is distributed
      on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
@@ -260,7 +260,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   if (bnz) SUPERLU_FREE (b_colind);
   SUPERLU_FREE (b_rowptr);
 
-#if 0  
+#if 0  /* The following code is not needed anymore, because parmetis
+	  now supports 64bit indexing */
   if ( iam < noDomains) {
     SUPERLU_FREE (options);
   }
@@ -283,7 +284,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   dist_order = dist_order_int;
 #endif
 
-#endif
+#endif  /* not needed any more */
+
   
   /* Allgatherv dist_order to get perm_c */
   if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
diff --git a/SRC/gpu_wrapper.h b/SRC/gpu_wrapper.h
index bb644864..65499945 100644
--- a/SRC/gpu_wrapper.h
+++ b/SRC/gpu_wrapper.h
@@ -38,6 +38,15 @@
 #define gpuStreamCreateWithFlags cudaStreamCreateWithFlags
 #define gpuStreamDefault cudaStreamDefault
 #define gpublasStatus_t cublasStatus_t
+#define gpuEventCreate cudaEventCreate
+#define gpuEventRecord cudaEventRecord
+#define gpuMemGetInfo cudaMemGetInfo
+#define gpuOccupancyMaxPotentialBlockSize cudaOccupancyMaxPotentialBlockSize
+#define gpuEventElapsedTime cudaEventElapsedTime
+#define gpuDeviceReset cudaDeviceReset
+#define gpuMallocHost cudaMallocHost
+#define gpuEvent_t cudaEvent_t
+#define gpuMemset cudaMemset
 #define  GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS 
 #define  GPUBLAS_STATUS_NOT_INITIALIZED CUBLAS_STATUS_NOT_INITIALIZED 
 #define  GPUBLAS_STATUS_ALLOC_FAILED CUBLAS_STATUS_ALLOC_FAILED 
@@ -46,6 +55,8 @@
 #define  GPUBLAS_STATUS_MAPPING_ERROR CUBLAS_STATUS_MAPPING_ERROR 
 #define  GPUBLAS_STATUS_EXECUTION_FAILED CUBLAS_STATUS_EXECUTION_FAILED 
 #define  GPUBLAS_STATUS_INTERNAL_ERROR CUBLAS_STATUS_INTERNAL_ERROR 
+#define  GPUBLAS_STATUS_LICENSE_ERROR CUBLAS_STATUS_LICENSE_ERROR 
+#define  GPUBLAS_STATUS_NOT_SUPPORTED CUBLAS_STATUS_NOT_SUPPORTED 
 #define  gpublasCreate cublasCreate 
 #define  gpublasDestroy cublasDestroy
 #define  gpublasHandle_t cublasHandle_t
@@ -105,6 +116,15 @@
 #define gpuStreamCreateWithFlags hipStreamCreateWithFlags
 #define gpuStreamDefault hipStreamDefault
 #define gpublasStatus_t hipblasStatus_t
+#define gpuEventCreate hipEventCreate
+#define gpuEventRecord hipEventRecord
+#define gpuMemGetInfo hipMemGetInfo
+#define gpuOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define gpuEventElapsedTime hipEventElapsedTime
+#define gpuDeviceReset hipDeviceReset
+#define gpuMallocHost hipMallocHost
+#define gpuEvent_t hipEvent_t
+#define gpuMemset hipMemset
 #define  GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS 
 #define  GPUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED 
 #define  GPUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED 
@@ -113,6 +133,8 @@
 #define  GPUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR 
 #define  GPUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED 
 #define  GPUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR 
+#define  GPUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_LICENSE_ERROR 
+#define  GPUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED 
 #define  gpublasCreate hipblasCreate 
 #define  gpublasDestroy hipblasDestroy
 #define  gpublasHandle_t hipblasHandle_t
diff --git a/SRC/gpublas_utils.c b/SRC/gpublas_utils.c
index ad0c1534..dff19cf9 100644
--- a/SRC/gpublas_utils.c
+++ b/SRC/gpublas_utils.c
@@ -72,11 +72,14 @@ const char* gpublasGetErrorString(gpublasStatus_t status)
         case GPUBLAS_STATUS_MAPPING_ERROR: return "GPUBLAS_STATUS_MAPPING_ERROR";
         case GPUBLAS_STATUS_EXECUTION_FAILED: return "GPUBLAS_STATUS_EXECUTION_FAILED"; 
         case GPUBLAS_STATUS_INTERNAL_ERROR: return "GPUBLAS_STATUS_INTERNAL_ERROR"; 
+        case GPUBLAS_STATUS_LICENSE_ERROR: return "GPUBLAS_STATUS_LICENSE_ERROR"; 
+        case GPUBLAS_STATUS_NOT_SUPPORTED: return "GPUBLAS_STATUS_NOT_SUPPORTED"; 
     }
     return "unknown error";
 }
 
-inline
+/*error reporting functions */
+//inline
 gpuError_t checkGPU(gpuError_t result)
 {
 #if defined(DEBUG) || defined(_DEBUG)
diff --git a/SRC/gpublas_utils.h b/SRC/gpublas_utils.h
index 895bdc6d..72cab46c 100644
--- a/SRC/gpublas_utils.h
+++ b/SRC/gpublas_utils.h
@@ -22,6 +22,7 @@ at the top-level directory.
 #ifdef GPU_ACC
 
 #include "gpu_wrapper.h"
+typedef struct LUstruct_gpu_  LUstruct_gpu;  // Sherry - not in this distribution
 
 extern void DisplayHeader();
 extern const char* gpublasGetErrorString(gpublasStatus_t status);
diff --git a/SRC/memory.c b/SRC/memory.c
index 8a253e9a..bee42f8f 100644
--- a/SRC/memory.c
+++ b/SRC/memory.c
@@ -66,8 +66,8 @@ void *superlu_malloc_dist(size_t size)
     int iam;
 
     MPI_Comm_rank(MPI_COMM_WORLD, &iam);
-    if ( size <= 0 ) {
-	printf("(%d) superlu_malloc size %lld\n", iam, size);
+    if ( size < 0 ) {
+	printf("(%d) superlu_malloc size %lu\n", iam, size);
 	ABORT("superlu_malloc: nonpositive size");
     }
 // #ifdef GPU_ACC    
@@ -77,7 +77,7 @@ void *superlu_malloc_dist(size_t size)
 // #endif
 	
     if ( !buf ) {
-	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lld\n",
+	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lu\n",
 	       iam, superlu_malloc_total*1e-6, size);
 	ABORT("superlu_malloc: out of memory");
     }
@@ -143,7 +143,7 @@ void superlu_free_dist(void *addr) { gpuError_t error = gpuFree(addr);}
 #else 
 
 #if  0 
-// #if (__STDC_VERSION__ >= 201112L)    // this is very slow on tulip
+// #if (__STDC_VERSION__ >= 201112L)   // cannot compile on Summit, also this is very slow on tulip
 
 void * superlu_malloc_dist(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);}
 void   superlu_free_dist(void * ptr)    {free(ptr);}
@@ -426,8 +426,7 @@ int_t symbfact_SubXpand
     
 #if ( DEBUGlevel>=1 )
     printf("symbfact_SubXpand(): jcol " IFMT ", next " IFMT ", maxlen " IFMT
-	   ", MemType " IFMT "\n",
-	   jcol, next, *maxlen, mem_type);
+	   ", MemType %d\n", jcol, next, *maxlen, mem_type);
 #endif    
 
     new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
new file mode 100644
index 00000000..ee57c7e7
--- /dev/null
+++ b/SRC/pd3dcomm.c
@@ -0,0 +1,876 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines for the 3D algorithm.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_ddefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(double), MPI_INFO_NULL, &(a)))
+
+int_t dAllocLlu(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+    
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t   **Lrowind_bc_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
+    double  **Lnzval_bc_ptr =
+	(double **) SUPERLU_MALLOC(sizeof(double*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (i = 0; i < nbc ; ++i)
+	{
+	    /* code */
+	    Lrowind_bc_ptr[i] = NULL;
+	    Lnzval_bc_ptr[i] = NULL;
+	}
+    
+    int_t   **Ufstnz_br_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
+    double  **Unzval_br_ptr =
+	(double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */
+    
+    for (i = 0; i < nbr ; ++i)
+	{
+	    /* code */
+	    Ufstnz_br_ptr[i] = NULL;
+	    Unzval_br_ptr[i] = NULL;
+	}
+
+   // Sherry: use int type
+                  /* Recv from no one (0), left (1), and up (2).*/
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+                  /* Whether need to send down block row. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
+                  /* List of processes to send right block col. */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    //ToSendR[i] = INT_T_ALLOC(Pc);
+	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+	}
+    
+    /*now setup the pointers*/
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
+    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
+    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
+    Llu->Unzval_br_ptr = Unzval_br_ptr ;
+    Llu->ToRecv = ToRecv ;
+    Llu->ToSendD = ToSendD ;
+    Llu->ToSendR = ToSendR ;
+    
+    return 0;
+} /* dAllocLlu */
+
+int_t dmpiMallocLUStruct(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    double * uval = Unzval_br_ptr[lb];
+	    double * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(double));
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    double *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(double));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+int_t dzSendLPanel(int_t k, int_t receiver,
+                   dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int_t len   = lsub[1];       /* LDA of the nzval[] */
+		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
+		    
+		    MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += len2 * sizeof(double);
+		}
+	}
+    return 0;
+}
+
+
+int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
+                    double* Lval_buf,
+                    dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int len   = lsub[1];       /* LDA of the nzval[] */
+		    int len2  = SuperSize(k) * len; /* size of nzval of L panels */
+		    
+		    MPI_Status status;
+		    MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_dscal(len2, alpha, lnzval, 1);
+		    superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+		}
+	}
+
+    return 0;
+}
+
+int_t dzSendUPanel(int_t k, int_t receiver,
+                    dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    double* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    
+		    /* code */
+		    MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += lenv * sizeof(double);
+		}
+	}
+	
+    return 0;
+}
+
+
+int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
+                    double* Uval_buf, dLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    double* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    MPI_Status status;
+		    MPI_Recv(Uval_buf , lenv, MPI_DOUBLE, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_dscal(lenv, alpha, unzval, 1);
+		    superlu_daxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+		}
+	}
+    return 0;
+}
+
+
+int_t dp3dScatter(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	dAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	dAllocLlu(nsupers, LUstruct, grid3d);
+    
+    dLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    dscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    dscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now sending tosendR etc */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    //    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+	mpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* dp3dScatter */
+
+
+int_t dscatter3dUPanels(int_t nsupers,
+		       dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	double * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];
+		    lens = usub[2];
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, MPI_DOUBLE, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end dscatter3dUPanels */
+
+
+int_t dscatter3dLPanels(int_t nsupers,
+                       dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		int_t nrbl, len, len1, len2;
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = doubleCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, MPI_DOUBLE, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* dscatter3dLPanels */
+
+int_t dcollect3dLpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    double* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		    
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];       /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
+			    
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+    {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	double * uval = Unzval_br_ptr[lb];
+	    
+	if (usub)
+	{
+	    /* code */
+	    int lenv = usub[1];
+	    if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm);
+		}
+		    
+	    if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status);
+		}
+	}
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+    dcollect3dLpanels(layer, nsupers,  LUstruct, grid3d);
+    dcollect3dUpanels(layer,  nsupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+int_t dzeroSetLU(int_t nnodes, int_t* nodeList, dLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t iam = grid->iam;
+    
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t jb = nodeList[node];
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub;
+		    double* lnzval;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub != NULL)
+			{
+			    int_t len   = lsub[1];       /* LDA of the nzval[] */
+			    int_t len2  = SuperSize(jb) * len;	/*size of nzval of L panel */
+			    memset( lnzval, 0, len2 * sizeof(double) );
+			}
+		}
+	}
+
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t ib = nodeList[node];
+	    int_t pr = PROW( ib, grid );
+	    if (myrow == pr)
+		{
+		    int_t lib = LBi( ib, grid ); /* Local block number */
+		    int_t  *usub;
+		    double* unzval;
+		    usub = Ufstnz_br_ptr[lib];
+		    unzval = Unzval_br_ptr[lib];
+		    
+		    if (usub != NULL)
+			{
+			    int lenv = usub[1];
+			    memset( unzval, 0, lenv * sizeof(double) );
+			}
+		}
+	}
+    
+    return 0;
+}
+
+
+int_t dreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        double* Lval_buf, double* Uval_buf,
+                        dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 1.0, beta = 1.0;	
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    
+	    if (myGrid == sender)
+		{
+		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		}
+	    else {
+	        dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                LUstruct, grid3d, SCT);
+		dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                LUstruct,  grid3d, SCT);
+	    }
+	    
+	}
+    return 0;
+    
+}
+
+
+int_t dgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        dLUValSubBuf_t* LUvsb,
+                        dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 0.0, beta = 1.0;	
+    double * Lval_buf  = LUvsb->Lval_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    if (myGrid == sender)
+		{
+		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    
+		}
+	    else
+		{
+		    dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                     LUstruct, grid3d, SCT);
+		    dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                     LUstruct, grid3d, SCT);
+		}
+	}
+    return 0;
+    
+}
+
+
+/*
+ * dinit3DLUstruct: zero out the L and U blocks of every elimination tree
+ * this process participates in only passively.
+ *
+ * maxLvl is the number of levels in the binary tree of z-processes
+ * (log2(Np) + 1).  For each level whose myZeroTrIdxs[lvl] flag is set,
+ * the supernodes of tree myTreeIdxs[lvl] (listed in nodeList[treeId],
+ * nodeCount[treeId] entries) are cleared via dzeroSetLU, so later
+ * reductions can accumulate into clean storage.
+ *
+ * Arguments:
+ *   myTreeIdxs    per-level tree id assigned to this process
+ *   myZeroTrIdxs  per-level flag: nonzero => zero that tree's blocks
+ *   nodeCount     per-tree supernode counts
+ *   nodeList      per-tree supernode id lists
+ *   LUstruct      local L/U storage to be cleared
+ *   grid3d        3D process grid (zscp.Np gives z-dimension size)
+ * Returns 0.
+ */
+int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, dLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+	{
+	    if (myZeroTrIdxs[lvl])
+		{
+		    /* passive at this level: clear this tree's L/U blocks */
+		    int_t treeId = myTreeIdxs[lvl];
+		    dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+		}
+	}
+    
+    return 0;
+}
+
+
+/*
+ * dreduceAllAncestors3d: reduce (sum) the partial L/U updates of all
+ * ancestor levels above `ilvl` between the two z-processes paired at
+ * this level, timing the work into SCT->ancsReduce.
+ *
+ * Pairing rule: processes are paired by bit ilvl of their z-rank.  If
+ * myGrid is a multiple of 2^(ilvl+1) it acts as the receiver and its
+ * partner myGrid + 2^ilvl is the sender; otherwise this process is the
+ * sender and myGrid - 2^ilvl receives.  Note both members of the pair
+ * compute the same (sender, receiver) values.
+ *
+ * Arguments:
+ *   ilvl         current level in the z-process binary tree
+ *   myNodeCount  per-level supernode counts for this process's trees
+ *   treePerm     per-level supernode id lists (permuted order)
+ *   LUvsb        scratch buffers whose Lval_buf/Uval_buf receive panels
+ *   LUstruct     local L/U storage, accumulated into on the receiver
+ *   grid3d       3D process grid
+ *   SCT          statistics; ancsReduce accumulates elapsed seconds
+ * Returns 0.
+ */
+int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             dLUValSubBuf_t* LUvsb, dLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    double * Lval_buf  = LUvsb->Lval_buf;
+    double * Uval_buf  = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    int_t sender, receiver;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+	    sender = myGrid + (1 << ilvl);
+	    receiver = myGrid;
+	}
+    else
+	{
+	    sender = myGrid;
+	    receiver = myGrid - (1 << ilvl);
+	}
+    
+    /* Reduce every ancestor level above ilvl, one level at a time. */
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+	{
+	    /* supernodes of the ancestor tree at level alvl */
+	    // int_t atree = myTreeIdxs[alvl];
+	    int_t nsAncestor = myNodeCount[alvl];
+	    int_t* cAncestorList = treePerm[alvl];
+	    double treduce = SuperLU_timer_();
+	    dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+	    SCT->ancsReduce += SuperLU_timer_() - treduce;
+	    
+	}
+    return 0;
+}
+
<![CDATA[
+/*
+ * dgatherAllFactoredLU: after factorization, gather the factored L/U
+ * panels of every subtree back down the z-dimension so each process
+ * holds the factors it needs.  Walks the levels of the z-process binary
+ * tree; at each level ilvl where this process is active
+ * (!myZeroTrIdxs[ilvl]), the (sender, receiver) pair is chosen by bit
+ * ilvl of the z-rank — same pairing rule as dreduceAllAncestors3d.
+ *
+ * Inner loops: for each already-processed level alvl <= ilvl, the trees
+ * owned by `sender` at forest level blvl = maxLvl - alvl - 1 are
+ * transferred.  The forest arrays gNodeCount/gNodeLists index trees
+ * level by level, with level blvl apparently starting at offset
+ * 2^blvl - 1 and the sender's first tree at (sender >> alvl) within it
+ * (st = (1 << blvl) - 1 + (sender >> alvl)); numTrees = 2^(ilvl - alvl)
+ * consecutive trees are gathered.  NOTE(review): this layout assumption
+ * follows getNodeCountsFr/getNodeListFr — confirm there.
+ *
+ * Arguments:
+ *   trf3Dpartition  3D partition info: per-level flags, forests, buffers
+ *   LUstruct        local L/U factor storage, filled in on receivers
+ *   grid3d          3D process grid
+ *   SCT             statistics/timing counters
+ * Returns 0.
+ *
+ * NOTE(review): only the top-level arrays gNodeCount/gNodeLists are
+ * freed here; whether the per-tree lists inside gNodeLists are owned by
+ * sForests or leak is not visible from this file — verify in
+ * getNodeListFr.
+ */
+int_t dgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    dLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* pair sender/receiver by bit ilvl of the z-rank */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;
+			    int_t blvl = maxLvl - alvl - 1;
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    /* move tree tr's factored panels to the receiver */
+				    dgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* dgatherAllFactoredLU */
]]>
+
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
index 4b7e9589..36ee80b0 100644
--- a/SRC/pddistribute.c
+++ b/SRC/pddistribute.c
@@ -13,9 +13,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
*/ @@ -75,9 +76,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; + double *aij, **aij_send, *nzval, *dtemp = NULL; double *nzval_a; double asum,asum_tot; int iam, it, p, procs, iam_g; @@ -143,8 +144,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, ABORT("Malloc fails for ia[]."); if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); + ja = ia + k; } - ja = ia + k; /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { @@ -172,9 +173,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { - ia_send[p] = &index[i]; + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } @@ -218,7 +219,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, NOTE: Can possibly use MPI_Alltoallv. 
------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); @@ -229,7 +231,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; @@ -248,7 +251,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang + //if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } diff --git a/SRC/pdgsequ.c b/SRC/pdgsequ.c index 8702fbf5..95adf5b6 100644 --- a/SRC/pdgsequ.c +++ b/SRC/pdgsequ.c @@ -13,7 +13,7 @@ at the top-level directory. /*! @file * \brief Computes row and column scalings * - * File name: pdgsequ.c + * File name: pdgsequ.c * History: Modified from LAPACK routine DGEEQU */ #include @@ -84,7 +84,7 @@ at the top-level directory. void pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, - double *colcnd, double *amax, int_t *info, gridinfo_t *grid) + double *colcnd, double *amax, int_t *info, gridinfo_t *grid) { /* Local variables */ @@ -102,20 +102,22 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Test the input parameters. 
*/ *info = 0; if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - pxerr_dist("pdgsequ", grid, i); - return; + A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) + { + i = -(*info); + pxerr_dist("pdgsequ", grid, i); + return; } /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; + if ( A->nrow == 0 || A->ncol == 0 ) + { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; } Astore = A->Store; @@ -131,43 +133,49 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each row. */ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; } /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; - for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) + { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); } /* Get the global MAX and MIN for R */ tempmax = rcmax; tempmin = rcmin; MPI_Allreduce( &tempmax, &rcmax, - 1, MPI_DOUBLE, MPI_MAX, grid->comm); + 1, MPI_DOUBLE, MPI_MAX, grid->comm); MPI_Allreduce( &tempmin, &rcmin, - 1, MPI_DOUBLE, MPI_MIN, grid->comm); + 1, MPI_DOUBLE, MPI_MIN, grid->comm); *amax = rcmax; - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. 
*/ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) + { + *info = i + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* Compute column scale factors */ @@ -176,17 +184,19 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each column, assuming the row scalings computed above. */ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); - } - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; } /* Find the global maximum for c[j] */ if ( !(loc_max = doubleMalloc_dist(A->ncol))) - ABORT("Malloc fails for loc_max[]."); + ABORT("Malloc fails for loc_max[]."); for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); SUPERLU_FREE(loc_max); @@ -194,34 +204,39 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum and minimum scale factors. 
*/ rcmin = bignum; rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); + for (j = 0; j < A->ncol; ++j) + { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); } - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) + { + *info = A->nrow + j + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* gather R from each process to get the global R. 
*/ procs = grid->nprow * grid->npcol; if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for r_sizes[]."); + ABORT("Malloc fails for r_sizes[]."); displs = r_sizes + procs; if ( !(loc_r = doubleMalloc_dist(m_loc))) - ABORT("Malloc fails for loc_r[]."); + ABORT("Malloc fails for loc_r[]."); j = Astore->fst_row; for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; @@ -230,15 +245,14 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Set up the displacements for allgatherv */ displs[0] = 0; - for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + for (i = 1; i < procs; ++i) displs[i] = displs[i - 1] + r_sizes[i - 1]; /* Now gather the actual data */ MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, - MPI_DOUBLE, grid->comm); + MPI_DOUBLE, grid->comm); SUPERLU_FREE(r_sizes); SUPERLU_FREE(loc_r); - return; } /* pdgsequ */ diff --git a/SRC/pdgsmv.c b/SRC/pdgsmv.c index db767647..1b2882f4 100644 --- a/SRC/pdgsmv.c +++ b/SRC/pdgsmv.c @@ -373,11 +373,11 @@ void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm) int_t *it; double *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index d42898e8..d0160c52 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -326,7 +326,7 @@ at the top-level directory. * = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows * of the original matrix to make the diagonal large * relative to the off-diagonal. 
- * = LargeDiag_APWM: use the parallel approximate-weight perfect + * = LargeDiag_HPWM: use the parallel approximate-weight perfect * matching to permute rows of the original matrix * to make the diagonal large relative to the * off-diagonal. @@ -408,7 +408,7 @@ at the top-level directory. * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * - * o R (double*) dimension (A->nrow) + * o R (double *) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). @@ -416,7 +416,7 @@ at the top-level directory. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * - * o C (double*) dimension (A->ncol) + * o C (double *) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). @@ -492,7 +492,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -590,13 +590,13 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Test the input parameters. 
*/ *info = 0; Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) + if ( Fact < DOFACT || Fact > FACTORED ) *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) *info = -1; else if ( options->IterRefine == SLU_EXTRA ) { *info = -1; @@ -667,6 +667,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -914,7 +915,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif - } else { /* use largeDiag_AWPM */ + } else { /* use LargeDiag_HWPM */ #ifdef HAVE_COMBBLAS d_c2cpp_GetHWPM(A, grid, ScalePermstruct); #else @@ -1061,7 +1062,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1083,10 +1084,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1226,11 +1227,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); fflush(stdout); #endif @@ -1321,7 +1317,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ @@ -1585,6 +1582,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c new file mode 100644 index 00000000..1a74194b --- /dev/null +++ b/SRC/pdgssvx3d.c @@ -0,0 +1,1589 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_ddefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinment is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + dScalePermstruct_t * ScalePermstruct, + double B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + dLUstruct_t * LUstruct, dSOLVEstruct_t * SOLVEstruct, + double *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + double *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + double *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 'd'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_D || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("pdgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + dGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (double *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter pdgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran 
= (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. 
*/ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (double *) GAstore->nzval; + GA_mem_use += nnz * sizeof (double); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + d_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + dp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pdgstrf3d (options, 
m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + dDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 
1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = doubleMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit pdgssvx3d()"); +#endif + +} diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c index f988bf94..14cf4f2d 100644 --- a/SRC/pdgssvx_ABglobal.c +++ b/SRC/pdgssvx_ABglobal.c @@ -588,6 +588,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -876,7 +877,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -899,7 +900,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1098,6 +1099,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index d15d31f9..9b2d056e 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -109,7 +109,6 @@ at the top-level directory. */ #include -/*#include "mkl.h"*/ #include "superlu_ddefs.h" #include "gpublas_utils.h" #ifdef GPU_ACC @@ -128,7 +127,7 @@ at the top-level directory. 
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -769,11 +768,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int gpublas_nb = get_gpublas_nb(); // default 64 - int nstreams = get_num_gpu_streams (); + int nstreams = get_num_gpu_streams (); // default 8 - int buffer_size = SUPERLU_MAX(max_row_size*nstreams*gpublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); /* array holding last column blk for each partition, used in SchCompUdt--GPU.c */ #if 0 @@ -785,8 +785,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -813,8 +814,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. 
BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -829,16 +834,19 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(double) ,gpuHostMallocDefault)) ) ABORT("Malloc fails for dgemm buffer V"); - if ( iam==0 )DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d GPU Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d GPU Streams \n",nstreams ); + fflush(stdout); + } #endif gpublasHandle_t *handle; @@ -881,10 +889,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -892,7 +901,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU)\n", iam, bigv_size); fflush(stdout); #endif @@ -906,7 +915,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for dgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1759,29 +1769,29 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. 
Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index 168fa750..8c5a1933 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -213,9 +213,8 @@ pdgstrf2_trsm for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; - /* Not to replace zero pivot. */ - // if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0 ) { - if (options->ReplaceTinyPivot == YES) { + /* May replace zero pivot. */ + if (options->ReplaceTinyPivot == YES ) { if (fabs (lusup[i]) < thresh) { /* Diagonal */ #if ( PRNTlevel>=2 ) @@ -363,44 +362,35 @@ pdgstrf2_trsm } /* PDGSTRF2_trsm */ -#if 0 /* COMMENT OUT 3D CODE FOR NOW */ /***************************************************************************** * The following functions are for the new pdgstrf2_dtrsm in the 3D code. 
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, - double* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, double* ublk_ptr, int ld_ujrow, + double* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; double alpha = 1.0; - unsigned long long t1 = _rdtsc(); + double t1 = SuperLU_timer_(); #define GT 32 +#ifdef _OPENMP #pragma omp parallel for +#endif for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); -#if 1 - #if defined (USE_VENDOR_BLAS) - dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr, - 1, 1, 1, 1); - #else - dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr); - #endif -#else - cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); -#endif + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ - t1 = _rdtsc() - t1; + t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; @@ -408,13 +398,30 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, #pragma GCC push_options #pragma GCC optimize ("O0") -/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    double *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, dLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     double alpha = -1, zero = 0.0;
 
@@ -424,8 +431,8 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     int_t jfst = FstBlockC (k);
     int_t jlst = FstBlockC (k + 1);
     double *lusup = Llu->Lnzval_bc_ptr[lk];
-    int_t nsupc = SuperSize (k);
-    int_t nsupr;
+    int nsupc = SuperSize (k);
+    int nsupr;
     if (Llu->Lrowind_bc_ptr[lk])
         nsupr = Llu->Lrowind_bc_ptr[lk][1];
     else
@@ -433,18 +440,19 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     double *ublk_ptr = BlockUFactor;
     double *ujrow = BlockUFactor;
+    int_t luptr = 0;                  /* Points to the diagonal entries. */
-    int_t cols_left = nsupc;          /* supernode size */
+    int cols_left = nsupc;          /* supernode size */
     int_t u_diag_cnt = 0;
     int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
-    int_t incx = 1;
-    int_t incy = ld_ujrow;
+    int incx = 1;
+    int incy = ld_ujrow;
 
     for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
     {
         /* Diagonal pivot */
         int_t i = luptr;
-        /* Not to replace zero pivot.  */
-        if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        /* May replace a zero pivot.  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
         {
             if (fabs (lusup[i]) < thresh) {  /* Diagonal */
 
@@ -485,17 +493,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
         if (--cols_left)
         {
             /*following must be int*/
-            int_t l = nsupc - j - 1;
+            int l = nsupc - j - 1;
 
 	    /* Rank-1 update */
-#if 1
-	    dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-		   &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], &nsupr);
-#else
-            cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx,
-                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
-                        nsupr);
-#endif
+            superlu_dger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
             stat->ops[FACT] += 2 * l * cols_left;
         }
 
@@ -506,8 +508,8 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
 
     //int_t thread_id = omp_get_thread_num();
-    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
-}
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Dgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
@@ -712,7 +714,7 @@ int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 
 int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 			      int_t *usub, double *uval, double *tempv,
-			      int_t knsupc, int_t nsupr, double *lusup,
+			      int_t knsupc, int nsupr, double *lusup,
+			      Glu_persist_t *Glu_persist)    /* Glu_persist supplies xsup, used by SuperSize */
 {
     double alpha = 1.0;
@@ -725,34 +727,22 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 
     // printf("klst inside task%d\n", );
     /*find ldu */
-    int_t ldu = 0;
+    int ldu = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
         ldu = SUPERLU_MAX( klst - usub[jj], ldu) ;
     }
 
     /*pack U block into a dense Block*/
-    int_t ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
+    int ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
     	                           uval, tempv);
 
     /*now call dtrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu,
-	     1, 1, 1, 1);
-  #else
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu);
-  #endif
-#else
-
-    cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
-                 ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
-#endif
+    
+    superlu_dtrsm("L", "L", "N", "U", ldu, ncols, alpha,
+		  &lusup[luptr], nsupr, tempv, ldu);
 
     /*now scatter the output into sparse U block*/
     dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
@@ -760,10 +750,9 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 }
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
-#if 1 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -845,8 +834,10 @@ void pdgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -889,7 +880,9 @@ void pdgstrs2_omp
 #endif
 	    } /* end if segsize > 0 */
 	} /* end for j in parallel ... */
+#ifdef _OPENMP    
 /* #pragma omp taskwait */
+#endif
     }  /* end for b ... */
 
 #ifndef USE_Ublock_info
@@ -912,7 +905,7 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, dLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -933,20 +926,22 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         double *tempv = bigV +  thread_id * ldt * ldt;
         dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
-#endif
+#endif /* there are 2 versions of pdgstrs2_omp */
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
new file mode 100644
index 00000000..27fe6bfe
--- /dev/null
+++ b/SRC/pdgstrf3d.c
@@ -0,0 +1,392 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Performs LU factorization in 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "dlustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSTRF3D performs the LU factorization in parallel using 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) dLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + dLocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* Test the input parameters. */ + *info = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()"); +#endif + + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + +#if 0 // sherry: not used + ddiagFactBufs_t dFBuf; + dinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); +#endif + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + dscuBufs_t scuBufs; + dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = 
trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + dLUValSubBuf_t** LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + ddiagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + /* Setting up GPU related data structures */ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + + dInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); + HyP->bigu_size = bigu_size; + int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size (); + HyP->buffer_size = buffer_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + dLUstruct_gpu_t *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + dsluGPU_t sluGPUobj; + dsluGPU_t *sluGPU = &sluGPUobj; + 
sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + + /*Initilize the lookup tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + + /* Initialize GPU data structures */ + dinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nGPUStreams = sluGPU->nGPUStreams; + } + +#endif // end GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + dsparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, 
&factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + dsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + dreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */ + +#ifdef GPU_ACC + /* This frees the GPU storage allocateed in initSluGPU3D_t() */ + if (superlu_acc_offload) { + dfree_LUstruct_gpu (sluGPU->A_gpu); + } +#endif + + /* Prepare error message - find the smallesr index i that U(i,i)==0 */ + int iinfo; + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + //printf("After factorization: INFO = %d\n", *info); fflush(stdout); + + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + dfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + 
freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + dLluBufFreeArr(numLA, LUvsbs); + dfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()"); +#endif + return 0; + +} /* pdgstrf3d */ diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c index dc23d0cf..be43fbc1 100644 --- a/SRC/pdgstrs.c +++ b/SRC/pdgstrs.c @@ -1158,7 +1158,13 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct, int_t cnt1,cnt2; -#ifdef GPU_ACC +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ + +#if ( PRNTlevel>=1 ) + if ( !iam) printf(".. GPU trisolve\n"); + fflush(stdout); +#endif + #ifdef GPUREF @@ -1546,8 +1552,7 @@ if(procs==1){ // fflush(stdout); // } - -#ifdef GPU_ACC /* CPU trisolve*/ +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ // #if 0 /* CPU trisolve*/ #ifdef GPUREF /* use cuSparse*/ @@ -2632,7 +2637,7 @@ thread_id=0; -#ifdef GPU_ACC /*GPU trisolve*/ +#if defined(GPU_ACC) && defined(SLU_HAVE_LAPACK) && defined(GPU_SOLVE) /* GPU trisolve*/ // #if 0 /* CPU trisolve*/ d_grid = NULL; diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c index 6c9af91f..9aa84033 100644 --- a/SRC/pdgstrs_lsum.c +++ b/SRC/pdgstrs_lsum.c @@ -14,14 +14,15 @@ at the top-level directory. * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * 
*/ @@ -528,11 +529,11 @@ void dlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -598,114 +599,113 @@ void dlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + // fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif @@ -716,30 +716,30 @@ void dlsum_fmod_inv #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block 
modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -781,16 +781,16 @@ void dlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; - } + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ // TOC(t3, t1); @@ -801,94 +801,91 @@ void dlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. + */ - if(LBtree_ptr[lk].empty_==NO){ + if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); - } - - /* - * Perform local block modifications. 
- */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. + */ - { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -950,7 +946,6 @@ void dlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* dLSUM_FMOD_INV */ @@ -1134,22 +1129,23 @@ void dlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... 
*/ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; @@ -1341,13 +1333,12 @@ void dlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - dlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1359,8 +1350,8 @@ void dlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* dLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end dlsum_fmod_inv_master */ @@ -1420,7 +1411,7 @@ void dlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = 1;//ceil(CACHELINE/(double)dword); aln_i = 1;//ceil(CACHELINE/(double)iword); @@ -1484,15 +1475,15 @@ void dlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... 
*/ } #if ( PROFlevel>=1 ) @@ -1500,7 +1491,6 @@ void dlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1512,9 +1502,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 2 * (iklrow - fnz); + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); } } /* for jj ... */ } @@ -1694,9 +1683,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* dlSUM_BMOD_inv */ @@ -1951,9 +1940,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. 
*/ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -1993,9 +1982,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -2029,9 +2018,9 @@ void dlsum_bmod_inv_master if ( iam != p ) { for (ii=1;iigstrs_comm); - - if ( options->RefineInitialized ) { - pdgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + pdgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; } /* dSolveFinalize */ +void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + 
/* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* dDestroy_A3d_gathered_on_2d */ + + /*! \brief Check the inf-norm of the error vector */ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, - double xtrue[], int_t ldxtrue, gridinfo_t *grid) + double xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; double *x_work, *xtrue_work; @@ -911,8 +937,8 @@ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); diff --git a/SRC/ps3dcomm.c b/SRC/ps3dcomm.c new file mode 100644 index 00000000..2c113562 --- /dev/null +++ b/SRC/ps3dcomm.c @@ -0,0 +1,876 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Communication routines for the 3D algorithm. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(float), MPI_INFO_NULL, &(a)))
+
+/*! \brief Allocate the per-process panel pointer arrays and send/recv
+ *  schedules inside LUstruct->Llu.
+ *
+ *  Creates nbc = ceil(nsupers/Pc) column-panel slots (Lrowind_bc_ptr /
+ *  Lnzval_bc_ptr) and nbr = ceil(nsupers/Pr) row-panel slots
+ *  (Ufstnz_br_ptr / Unzval_br_ptr), all initialized to NULL; the actual
+ *  index/value storage is filled in later when the panels are scattered.
+ *  Also allocates and zeroes the ToRecv / ToSendD / ToSendR arrays.
+ *  NOTE(review): Pc/Pr are read from grid3d->npcol/nprow directly —
+ *  presumably these mirror the 2D grid dimensions; confirm against
+ *  gridinfo3d_t.  SUPERLU_MALLOC results are not checked here (matches
+ *  the surrounding codebase style).
+ *  Returns 0.
+ */
+int_t sAllocLlu(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t   **Lrowind_bc_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
+    float  **Lnzval_bc_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (i = 0; i < nbc ; ++i)
+	{
+	    /* code */
+	    Lrowind_bc_ptr[i] = NULL;
+	    Lnzval_bc_ptr[i] = NULL;
+	}
+    
+    int_t   **Ufstnz_br_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
+    float  **Unzval_br_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbr); /* size ceil(NSUPERS/Pr) */
+    
+    for (i = 0; i < nbr ; ++i)
+	{
+	    /* code */
+	    Ufstnz_br_ptr[i] = NULL;
+	    Unzval_br_ptr[i] = NULL;
+	}
+
+   // Sherry: use int type
+                  /* Recv from no one (0), left (1), and up (2).*/
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+                  /* Whether need to send down block row. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
+                  /* List of processes to send right block col. */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+
+    /* One length-Pc flag row per local block column. */
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    //ToSendR[i] = INT_T_ALLOC(Pc);
+	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+	}
+    
+    /*now setup the pointers*/
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
+    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
+    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
+    Llu->Unzval_br_ptr = Unzval_br_ptr ;
+    Llu->ToRecv = ToRecv ;
+    Llu->ToSendD = ToSendD ;
+    Llu->ToSendR = ToSendR ;
+    
+    return 0;
+} /* sAllocLlu */
+
+/*! \brief Migrate every nonempty local L and U panel into MPI-allocated
+ *  memory.
+ *
+ *  For each nonempty U row panel and (owned) L column panel, a new buffer
+ *  is obtained via MPI_INT_ALLOC / MPI_DATATYPE_ALLOC (wrappers around
+ *  MPI_Alloc_mem), the index and value data are memcpy'd over, the panel
+ *  pointer is redirected to the new buffer, and only then is the old
+ *  SUPERLU_MALLOC'd storage freed — the copy-before-free order is what
+ *  keeps the panel valid throughout.
+ *  Panel sizes: U uses lenv = usub[1] values and lens = usub[2] index
+ *  entries; L uses len1 = len + BC_HEADER + nrbl*LB_DESCRIPTOR index
+ *  entries and len2 = SuperSize(jb)*len values.
+ *  NOTE(review): xsup is fetched but never used in this routine.
+ *  Returns 0.
+ */
+int_t smpiMallocLUStruct(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    float * uval = Unzval_br_ptr[lb];
+	    float * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    /* Copy into MPI memory first, then release the old buffers. */
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(float));
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)   /* only the owning process column has this panel */
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    float *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(float));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+/*! \brief Send the numerical values of local L panel k to another grid
+ *  layer.
+ *
+ *  If this process is in the owning 2D process column (mycol == PCOL(k))
+ *  and the panel is nonempty, blocking-send len2 = SuperSize(k)*lsub[1]
+ *  floats to rank `receiver` on the z-dimension communicator
+ *  grid3d->zscp.comm, using the supernode number k as the message tag.
+ *  The sent byte count is accumulated into SCT->commVolRed.
+ *  Returns 0 (also when this rank owns nothing to send).
+ */
+int_t szSendLPanel(int_t k, int_t receiver,
+                   sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int_t len   = lsub[1];       /* LDA of the nzval[] */
+		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
+		    
+		    MPI_Send(lnzval, len2, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += len2 * sizeof(float);
+		}
+	}
+    return 0;
+}
+
+
+/*! \brief Receive L panel k from another grid layer and reduce it into
+ *  the local copy.
+ *
+ *  If this process is in the owning 2D process column and the panel is
+ *  nonempty, receive len2 = SuperSize(k)*lsub[1] floats from `sender`
+ *  (tag k, communicator grid3d->zscp.comm) into Lval_buf, then update
+ *  in place:  lnzval = alpha*lnzval + beta*Lval_buf, via superlu_sscal
+ *  followed by superlu_saxpy.
+ *  NOTE(review): the local `inc` is unused — the BLAS-like calls pass
+ *  the literal stride 1 directly.
+ *  Returns 0.
+ */
+int_t szRecvLPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Lval_buf,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int len   = lsub[1];       /* LDA of the nzval[] */
+		    int len2  = SuperSize(k) * len; /* size of nzval of L panels */
+		    
+		    MPI_Status status;
+		    MPI_Recv(Lval_buf , len2, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(len2, alpha, lnzval, 1);
+		    superlu_saxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+		}
+	}
+
+    return 0;
+}
+
+/*! \brief Send the numerical values of local U panel k to another grid
+ *  layer.
+ *
+ *  Row-panel counterpart of szSendLPanel: if this process is in the
+ *  owning 2D process row (myrow == PROW(k)) and the panel is nonempty,
+ *  blocking-send lenv = usub[1] floats to rank `receiver` on
+ *  grid3d->zscp.comm with tag k, and accumulate the byte count into
+ *  SCT->commVolRed.
+ *  Returns 0.
+ */
+int_t szSendUPanel(int_t k, int_t receiver,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];  /* number of stored U values in this panel */
+		    
+		    /* code */
+		    MPI_Send(unzval, lenv, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += lenv * sizeof(float);
+		}
+	}
+	
+    return 0;
+}
+
+
+/*! \brief Receive U panel k from another grid layer and reduce it into
+ *  the local copy.
+ *
+ *  Row-panel counterpart of szRecvLPanel: if this process is in the
+ *  owning 2D process row and the panel is nonempty, receive lenv =
+ *  usub[1] floats from `sender` (tag k, grid3d->zscp.comm) into
+ *  Uval_buf, then update in place:
+ *      unzval = alpha*unzval + beta*Uval_buf
+ *  via superlu_sscal followed by superlu_saxpy.
+ *  NOTE(review): the local `inc` is unused — stride 1 is passed
+ *  literally to the BLAS-like helpers.
+ *  Returns 0.
+ */
+int_t szRecvUPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Uval_buf, sLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    MPI_Status status;
+		    MPI_Recv(Uval_buf , lenv, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(lenv, alpha, unzval, 1);
+		    superlu_saxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+		}
+	}
+    return 0;
+}
+
+
+int_t sp3dScatter(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    /* Only layer 0 can compute nsupers locally; other layers obtain it
+       through the broadcast below. */
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	sAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	sAllocLlu(nsupers, LUstruct, grid3d);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    sscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    sscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    /* Communication buffer sizes computed on layer 0. */
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now sending tosendR etc */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);  /* number of local block rows    */
+    int_t nbc = CEILING(nsupers, Pc);  /* number of local block columns */
+    //    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    /* ToSendR is an array of per-block-column row vectors, so each one is
+       broadcast separately. */
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+	mpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* sp3dScatter */
+
+
+/* Broadcast every non-empty local U panel (index array usub and values
+ * uval) from z-layer 0 to all other layers.  Per panel the protocol is:
+ * flag (non-empty?) -> sizes -> usub -> uval; non-root layers allocate
+ * before each payload broadcast and install the new pointers. */
+int_t sscatter3dUPanels(int_t nsupers,
+		       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	float * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    /* Sizes are read from usub on layer 0 only; other layers get
+	       them via the two broadcasts below. */
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];   /* number of nzval entries */
+		    lens = usub[2];   /* length of the index array */
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = floatMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, MPI_FLOAT, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end sScatter3dUPanels */
+
+
+/* Broadcast every non-empty local L panel (index array lsub and values
+ * lnzval) from z-layer 0 to all other layers.  Per block column owned by
+ * this process column the protocol is: flag -> len1 -> lsub -> len2 ->
+ * lnzval; non-root layers allocate before each payload broadcast. */
+int_t sscatter3dLPanels(int_t nsupers,
+                       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		/* Sizes known on layer 0 only; len1/len2 reach the other
+		   layers via the broadcasts below. */
+		int_t nrbl, len, len1, len2;
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /* NOTE(review): the deeper indentation from here on is
+		       misleading — only the single allocation statement above
+		       is guarded by the `if (grid3d->zscp.Iam)`; everything
+		       below executes on all layers. */
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = floatCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, MPI_FLOAT, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* sscatter3dLPanels */
+
+/* Collect the L panels of z-process layer `layer` onto layer 0: for each
+ * non-empty local L panel, the owning layer sends its nzval[] and layer 0
+ * receives it in place (overwriting its own copy), matched by tag jb.
+ *
+ * Bug fix: the receive used MPI_DOUBLE while the matching send uses
+ * MPI_FLOAT; in this single-precision (s) routine both sides must use
+ * MPI_FLOAT, otherwise the type signatures / byte counts mismatch. */
+int_t scollect3dLpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		    
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];       /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
+			    
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, MPI_FLOAT, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    /* was MPI_DOUBLE — must match the MPI_FLOAT send above */
+		    MPI_Recv(lnzval, len2, MPI_FLOAT, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+/* Collect the U panels of z-process layer `layer` onto layer 0: for each
+ * non-empty local U panel, the owning layer sends its nzval[] and layer 0
+ * receives it in place (overwriting its own copy), matched by tag lb. */
+int_t scollect3dUpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+    {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	float * uval = Unzval_br_ptr[lb];
+	    
+	if (usub)  /* panel is non-empty on this process */
+	{
+	    /* code */
+	    int lenv = usub[1];  /* count of nzval entries */
+	    if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(uval, lenv, MPI_FLOAT, 0, lb, grid3d->zscp.comm);
+		}
+		    
+	    if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(uval, lenv, MPI_FLOAT, layer, lb, grid3d->zscp.comm, &status);
+		}
+	}
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t sp3dCollect(int_t layer, int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Bring both the L and the U panels of z-layer `layer` onto layer 0. */
+{
+    /* Total number of supernodes in the factorization. */
+    int_t nSupers = getNsupers(n, LUstruct->Glu_persist);
+
+    scollect3dLpanels(layer, nSupers, LUstruct, grid3d);
+    scollect3dUpanels(layer, nSupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+/* Zero out the numerical values (nzval) of the L and U panels owned by
+ * this 2D process for every supernode listed in nodeList[0..nnodes-1].
+ * Index structures are left untouched; only the float payloads are
+ * memset to zero. */
+int_t szeroSetLU(int_t nnodes, int_t* nodeList, sLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* needed by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Pass 1: zero the L panels of block columns owned by my process column. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t jb = nodeList[node];
+	if (mycol != PCOL( jb, grid )) continue;
+
+	int_t ljb = LBj( jb, grid );           /* local block-column index */
+	int_t *colIndex = Lrowind_bc_ptr[ljb];
+	float *colVals  = Lnzval_bc_ptr[ljb];
+
+	if (colIndex != NULL)
+	{
+	    int_t lda    = colIndex[1];            /* LDA of the nzval[] */
+	    int_t nVals  = SuperSize(jb) * lda;    /* size of nzval of L panel */
+	    memset( colVals, 0, nVals * sizeof(float) );
+	}
+    }
+
+    /* Pass 2: zero the U panels of block rows owned by my process row. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t ib = nodeList[node];
+	if (myrow != PROW( ib, grid )) continue;
+
+	int_t lib = LBi( ib, grid );           /* local block-row index */
+	int_t *rowIndex = Ufstnz_br_ptr[lib];
+	float *rowVals  = Unzval_br_ptr[lib];
+
+	if (rowIndex != NULL)
+	{
+	    int nVals = rowIndex[1];           /* count of nzval entries */
+	    memset( rowVals, 0, nVals * sizeof(float) );
+	}
+    }
+
+    return 0;
+}
+
+
+/* Pairwise reduction of ancestor panels along the z dimension: for every
+ * supernode in nodeList[0..nnodes-1], the `sender` layer ships its L and U
+ * panels and the partner accumulates them into its own copies with
+ * alpha = beta = 1 (i.e. local += incoming). */
+int_t sreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        float* Lval_buf, float* Uval_buf,
+                        sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 1.0, beta = 1.0;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+	int_t jb = nodeList[node];
+
+	if (myGrid == sender)
+	{
+	    /* Ship my copies of the panels of supernode jb. */
+	    szSendLPanel(jb, receiver, LUstruct, grid3d, SCT);
+	    szSendUPanel(jb, receiver, LUstruct, grid3d, SCT);
+	}
+	else
+	{
+	    /* Receive into the scratch buffers and accumulate locally. */
+	    szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+	                 LUstruct, grid3d, SCT);
+	    szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+	                 LUstruct, grid3d, SCT);
+	}
+    }
+    return 0;
+}
+
+
+/* Gather factored L and U panels for the supernodes in nodeList from the
+ * `sender` layer to its partner: with alpha = 0, beta = 1 the receiver's
+ * panel is overwritten by the incoming data (local := 0*local + 1*buf),
+ * unlike sreduceAncestors3d which accumulates. */
+int_t sgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        sLUValSubBuf_t* LUvsb,
+                        sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    double alpha = 0.0, beta = 1.0;	
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    if (myGrid == sender)
+		{
+		    szSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    szSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    
+		}
+	    else
+		{
+		    szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                     LUstruct, grid3d, SCT);
+		    szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                     LUstruct, grid3d, SCT);
+		}
+	}
+    return 0;
+    
+}
+
+
+/* Initialize the 3D LU structure on this process: for every level of the
+ * elimination-tree hierarchy where this process holds a zero-tree
+ * (myZeroTrIdxs[lvl] set), zero out the LU panels of that tree's nodes. */
+int_t sinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, sLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    /* Number of levels in the z-layer hierarchy. */
+    int_t numLevels = log2i(grid3d->zscp.Np) + 1;
+
+    for (int_t lvl = 0; lvl < numLevels; lvl++)
+    {
+	if (!myZeroTrIdxs[lvl]) continue;  /* nothing to zero at this level */
+
+	int_t treeId = myTreeIdxs[lvl];
+	szeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+    }
+
+    return 0;
+}
+
+
+/* At hierarchy level ilvl, pair up z-layers at distance 2^ilvl (the layer
+ * whose index is a multiple of 2^(ilvl+1) is the receiver, its partner the
+ * sender) and reduce every ancestor level above ilvl into the receiver.
+ * Time spent is accumulated into SCT->ancsReduce. */
+int sreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             sLUValSubBuf_t* LUvsb, sLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    /* Determine my role in the pair at this level. */
+    int_t sender, receiver;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+	    sender = myGrid + (1 << ilvl);  /* partner layer sends to me */
+	    receiver = myGrid;
+	}
+    else
+	{
+	    sender = myGrid;                /* I send to my partner */
+	    receiver = myGrid - (1 << ilvl);
+	}
+    
+    /*Reduce all the ancestors*/
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+	{
+	    /* code */
+	    // int_t atree = myTreeIdxs[alvl];
+	    int_t nsAncestor = myNodeCount[alvl];   /* nodes at this ancestor level */
+	    int_t* cAncestorList = treePerm[alvl];  /* their supernode indices */
+	    double treduce = SuperLU_timer_();
+	    sreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+	    SCT->ancsReduce += SuperLU_timer_() - treduce;
+	    
+	}
+    return 0;
+}
+
+/* Gather all factored LU panels across the z-layer hierarchy: at each
+ * level ilvl (on which this process participates, i.e. its tree is not a
+ * zero-tree), layers at distance 2^ilvl pair up and the sender's panels
+ * for every subtree it factored are copied to the receiver via
+ * sgatherFactoredLU.  Temporary per-tree node counts/lists are freed
+ * before returning. */
+int_t sgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    sLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* code */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    /* Same pairing rule as sreduceAllAncestors3d: layers at
+		       distance 2^ilvl exchange, the 2^(ilvl+1)-aligned layer
+		       is the receiver. */
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;  /* subtrees handled at this depth */
+			    int_t blvl = maxLvl - alvl - 1;
+			    /* st: index of the sender's first tree at depth blvl;
+			       NOTE(review): assumes heap-style (level-order)
+			       numbering of the forest — confirm against the
+			       sForest construction. */
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    /* code */
+				    sgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* sgatherAllFactoredLU */
+
diff --git a/SRC/psGetDiagU.c b/SRC/psGetDiagU.c
new file mode 100644
index 00000000..9303baa9
--- /dev/null
+++ b/SRC/psGetDiagU.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file psGetDiagU.c
+ * \brief Extracts the main diagonal of matrix U
+ *
+ * 
+ * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Xiaoye S. Li
+ * Created:  April 16, 2002
+ * Modified: May 15, 2016
+ * 
+ */ + + + +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
+ *
+ * Arguments
+ * =========
+ *
+ * n        (input) int
+ *          Dimension of the matrix.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *          The data structures to store the distributed L and U factors.
+ *          see superlu_sdefs.h for its definition.
+ *
+ * grid     (input) gridinfo_t*
+ *          The 2D process mesh. It contains the MPI communicator, the number
+ *          of process rows (NPROW), the number of process columns (NPCOL),
+ *          and my process rank. It is an input argument to all the
+ *          parallel routines.
+ *
+ * diagU    (output) float*, dimension (n)
+ *          The main diagonal of matrix U.
+ *          On exit, it is available on all processes.
+ *
+ *
+ * Note
+ * ====
+ *
+ * The diagonal blocks of the L and U matrices are stored in the L
+ * data structures, and are on the diagonal processes of the
+ * 2D process grid.
+ *
+ * This routine is modified from gather_diag_to_all() in psgstrs_Bglobal.c.
+ * 
+ */ +void psGetDiagU(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *diagU) +{ + + int_t *xsup; + int iam, knsupc, pkk; + int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int_t i, j, jj, k, lk, lwork, nsupers, p; + int_t num_diag_procs, *diag_procs, *diag_len; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float *sblock, *swork, *lusup; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); + if ( !(swork = floatMalloc_dist(jj)) ) ABORT("Malloc fails for swork[]"); + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy diagonal into buffer dwork[]. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBj( k, grid ); + nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ + lusup = Llu->Lnzval_bc_ptr[lk]; + for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */ + swork[lwork+i] = lusup[i*(nsupr+1)]; + lwork += knsupc; + } + MPI_Bcast( swork, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( swork, diag_len[p], MPI_FLOAT, pkk, grid->comm ); + } + + /* Scatter swork[] into global diagU vector. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + sblock = &diagU[FstBlockC( k )]; + for (i = 0; i < knsupc; ++i) sblock[i] = swork[lwork+i]; + lwork += knsupc; + } + } /* for p = ... */ + + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + SUPERLU_FREE(swork); +} diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c new file mode 100644 index 00000000..30c114cb --- /dev/null +++ b/SRC/psdistribute.c @@ -0,0 +1,1988 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Re-distribute A on the 2D process mesh. + *
+ * -- Distributed SuperLU routine (version 7.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
+ * 
+ */ + +#include "superlu_sdefs.h" + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * colptr (output) int*
+ *
+ * rowind (output) int*
+ *
+ * a      (output) float*
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * 
+ */ +int_t +sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, + gridinfo_t *grid, int_t *colptr[], int_t *rowind[], + float *a[]) +{ + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize,nnz_tot; + int_t nnz_loc; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; + int_t *ptr_to_send; + float *aij, **aij_send, *nzval, *dtemp = NULL; + float *nzval_a; + float asum,asum_tot; + int iam, it, p, procs, iam_g; + MPI_Request *send_req; + MPI_Status status; + + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sReDistribute_A()"); +#endif + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nnzToRecv = intCalloc_dist(2*procs); + nnzToSend = nnzToRecv + procs; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. 
+ ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); + + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + + /* Allocate space for storing the triplets after redistribution. */ + if ( k ) { /* count can be zero. */ + if ( !(ia = intMalloc_dist(2*k)) ) + ABORT("Malloc fails for ia[]."); + if ( !(aij = floatMalloc_dist(k)) ) + ABORT("Malloc fails for aij[]."); + ja = ia + k; + } + + /* Allocate temporary storage for sending/receiving the A triplets. 
*/ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) + ABORT("Malloc fails for ia_send[]."); + if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) ) + ABORT("Malloc fails for aij_send[]."); + if ( SendCnt ) { /* count can be zero */ + if ( !(index = intMalloc_dist(2*SendCnt)) ) + ABORT("Malloc fails for index[]."); + if ( !(nzval = floatMalloc_dist(SendCnt)) ) + ABORT("Malloc fails for nzval[]."); + } + if ( !(ptr_to_send = intCalloc_dist(procs)) ) + ABORT("Malloc fails for ptr_to_send[]."); + if ( maxnnzToRecv ) { /* count can be zero */ + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) + ABORT("Malloc fails for itemp[]."); + if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) ) + ABORT("Malloc fails for dtemp[]."); + } + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; + j += nnzToSend[p]; + } + } + } /* if procs > 1 */ + + if ( !(*colptr = intCalloc_dist(n+1)) ) + ABORT("Malloc fails for *colptr[]."); + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. 
*/ + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + ++(*colptr)[jcol]; /* Count nonzeros in each column */ + } + } + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. + ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_FLOAT, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /*assert(jcol 0 ) { // cause two of the tests to hang + //if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + + if ( procs > 1 ) { + 
SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + if ( SendCnt ) { + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + } + SUPERLU_FREE(ptr_to_send); + if ( maxnnzToRecv ) { + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + } + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. + ------------------------------------------------------------*/ + if ( nnz_loc ) { /* nnz_loc can be zero */ + if ( !(*rowind = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *rowind[]."); + if ( !(*a = floatMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *a[]."); + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = (*colptr)[0]; + (*colptr)[0] = 0; + for (j = 1; j < n; ++j) { + k += jsize; + jsize = (*colptr)[j]; + (*colptr)[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + j = ja[i]; + k = (*colptr)[j]; + (*rowind)[k] = ia[i]; + (*a)[k] = aij[i]; + ++(*colptr)[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; + (*colptr)[0] = 0; + + if ( nnz_loc ) { + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sReDistribute_A()"); +#endif + + return 0; +} /* sReDistribute_A */ + +float +psdistribute(fact_t fact, int_t n, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, sLUstruct_t *LUstruct, + gridinfo_t *grid) +/* + * -- Distributed SuperLU routine (version 2.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * March 15, 2003 + * + * + * Purpose + * ======= + * Distribute the matrix onto the 2D process mesh. + * + * Arguments + * ========= + * + * fact (input) fact_t + * Specifies whether or not the L and U structures will be re-used. + * = SamePattern_SameRowPerm: L and U structures are input, and + * unchanged on exit. 
+ * = DOFACT or SamePattern: L and U structures are computed and output. + * + * n (input) int + * Dimension of the matrix. + * + * A (input) SuperMatrix* + * The distributed input matrix A of dimension (A->nrow, A->ncol). + * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: + * Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. + * + * ScalePermstruct (input) sScalePermstruct_t* + * The data structure to store the scaling and permutation vectors + * describing the transformations performed to the original matrix A. + * + * Glu_freeable (input) *Glu_freeable_t + * The global structure describing the graph of L and U. + * + * LUstruct (input) sLUstruct_t* + * Data structures for L and U factors. + * + * grid (input) gridinfo_t* + * The 2D process mesh. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + float *a; + int_t *asub, *xa; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; 
/* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ + + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. 
*/ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; + +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(float); +//#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psdistribute()"); +#endif +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + sReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); +#endif + + if ( fact == SamePattern_SameRowPerm ) { + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... 
*/ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. 
*/ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif + + } else { /* fact is not SamePattern_SameRowPerm */ + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. 
*/ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. 
*/ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Phase 2 - setup U strut time: %.2f\t\n", t); +#endif + + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. 
*/ + if ( !(Lnzval_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. 
+ THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... */ + + jbrow = PROW( jb, grid ); + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. 
owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. 
proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + 
Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = 0.0; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] + and Lnzval_bc_ptr[ljb] here. */ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. 
*/ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... 
*/ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // 
for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. 
*/ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); +#endif + + } /* else fact != SamePattern_SameRowPerm */ + + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. 
*/ + SUPERLU_FREE(asub); + SUPERLU_FREE(a); + } + SUPERLU_FREE(xa); + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit psdistribute()"); +#endif + + return (mem_use+memTRS); + +} /* PSDISTRIBUTE */ diff --git a/SRC/psgsequ.c b/SRC/psgsequ.c new file mode 100644 index 00000000..02bd4a0d --- /dev/null +++ b/SRC/psgsequ.c @@ -0,0 +1,244 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Computes row and column scalings + * + * File name: psgsequ.c + * History: Modified from LAPACK routine SGEEQU + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSGSEQU computes row and column scalings intended to equilibrate an
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make
+    the largest element in each row and column of the matrix B with
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+    number and BIGNUM = largest safe number.  Use of these scaling
+    factors is not guaranteed to reduce the condition number of A but
+    works well in practice.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (output) float*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors
+            for A.
+
+    C       (output) float*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+
+    ROWCND  (output) float*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and
+            AMAX is neither too large nor too small, it is not worth
+            scaling by R.
+
+    COLCND  (output) float*
+            If INFO = 0, COLCND contains the ratio of the smallest
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not
+            worth scaling by C.
+
+    AMAX    (output) float*
+            Absolute value of largest matrix element.  If AMAX is very
+            close to overflow or very close to underflow, the matrix
+            should be scaled.
+
+    INFO    (output) int*
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+            > 0:  if INFO = i,  and i is
+                  <= M:  the i-th row of A is exactly zero
+                  >  M:  the (i-M)-th column of A is exactly zero
+
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+    =====================================================================
+
+*/ + +void +psgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info, gridinfo_t *grid) +{ + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int i, j, irow, jcol, m_loc; + float rcmin, rcmax; + float bignum, smlnum; + float tempmax, tempmin; + float *loc_max; + int *r_sizes, *displs; + float *loc_r; + int_t procs; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NR_loc || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + pxerr_dist("psgsequ", grid, i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Get machine constants. */ + smlnum = smach_dist("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + + /* Get the global MAX and MIN for R */ + tempmax = rcmax; + tempmin = rcmin; + MPI_Allreduce( &tempmax, &rcmax, + 1, MPI_FLOAT, MPI_MAX, grid->comm); + MPI_Allreduce( &tempmin, &rcmin, + 1, MPI_FLOAT, MPI_MIN, grid->comm); + + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. 
/ SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; + } + + /* Find the global maximum for c[j] */ + if ( !(loc_max = floatMalloc_dist(A->ncol))) + ABORT("Malloc fails for loc_max[]."); + for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; + MPI_Allreduce(loc_max, c, A->ncol, MPI_FLOAT, MPI_MAX, grid->comm); + SUPERLU_FREE(loc_max); + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* gather R from each process to get the global R. */ + + procs = grid->nprow * grid->npcol; + if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) + ABORT("Malloc fails for r_sizes[]."); + displs = r_sizes + procs; + if ( !(loc_r = floatMalloc_dist(m_loc))) + ABORT("Malloc fails for loc_r[]."); + j = Astore->fst_row; + for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; + + /* First gather the size of each piece. 
*/ + MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); + + /* Set up the displacements for allgatherv */ + displs[0] = 0; + for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + + /* Now gather the actual data */ + MPI_Allgatherv(loc_r, m_loc, MPI_FLOAT, r, r_sizes, displs, + MPI_FLOAT, grid->comm); + + SUPERLU_FREE(r_sizes); + SUPERLU_FREE(loc_r); + + return; + +} /* psgsequ */ diff --git a/SRC/psgsmv.c b/SRC/psgsmv.c new file mode 100644 index 00000000..b0478a9d --- /dev/null +++ b/SRC/psgsmv.c @@ -0,0 +1,383 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Parallel sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +void psgsmv_init +( + SuperMatrix *A, /* Matrix A permuted by columns (input/output). + The type of A can be: + Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t *row_to_proc, /* Input. Mapping between rows and processes. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ + ) +{ + NRformat_loc *Astore; + int iam, p, procs; + int *SendCounts, *RecvCounts; + int_t i, j, k, l, m, m_loc, n, fst_row, jcol; + int_t TotalIndSend, TotalValSend; + int_t *colind, *rowptr; + int_t *ind_tosend = NULL, *ind_torecv = NULL; + int_t *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, *spa, *itemp; + float *nzval, *val_tosend = NULL, *val_torecv = NULL, t; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv_init()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + m = A->nrow; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = Astore->nzval; + if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for SendCounts[]"); + /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ + RecvCounts = SendCounts + procs; + if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) + ABORT("Malloc fails for ptr_ind_tosend[]"); + ptr_ind_torecv = ptr_ind_tosend + procs + 1; + if ( !(extern_start = intMalloc_dist(m_loc)) ) + ABORT("Malloc fails for extern_start[]"); + for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. 
+ THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. + SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE + LOCAL PART OF X. + THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ + ABORT("Malloc fails for spa[]"); + for (p = 0; p < procs; ++p) SendCounts[p] = 0; + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = extern_start[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ + jcol = colind[j]; + p = row_to_proc[jcol]; + if ( p != iam ) { /* External */ + if ( spa[jcol] == 0 ) { /* First time see this index */ + ++SendCounts[p]; + spa[jcol] = 1; + } + } else { /* Swap to beginning the part of A corresponding + to the local part of X */ + l = colind[k]; + t = nzval[k]; + colind[k] = jcol; + nzval[k] = nzval[j]; + colind[j] = l; + nzval[j] = t; + ++k; + } + } + extern_start[i] = k; + } + + /* ------------------------------------------------------------ + LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. + THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + /* Build pointers to ind_tosend[]. */ + ptr_ind_tosend[0] = 0; + for (p = 0, TotalIndSend = 0; p < procs; ++p) { + TotalIndSend += SendCounts[p]; /* Total to send. */ + ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; + } +#if 0 + ptr_ind_tosend[iam] = 0; /* Local part of X */ +#endif + if ( TotalIndSend ) { + if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ + } + + /* Build SPA to aid global to local translation. 
*/ + for (i = 0; i < n; ++i) spa[i] = EMPTY; + for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + if ( spa[jcol] == EMPTY ) { /* First time see this index */ + p = row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + /*assert(jcol>=fst_row);*/ + spa[jcol] = jcol - fst_row; /* Relative position in local X */ + } else { /* External */ + ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ + spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ + ++ptr_ind_tosend[p]; + } + } + } + } + + /* ------------------------------------------------------------ + TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. + THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + colind[j] = spa[jcol]; + } + } + + /* ------------------------------------------------------------ + COMMUNICATE THE EXTERNAL INDICES OF X. + ------------------------------------------------------------*/ + MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, + grid->comm); + + /* Build pointers to ind_torecv[]. */ + ptr_ind_torecv[0] = 0; + for (p = 0, TotalValSend = 0; p < procs; ++p) { + TotalValSend += RecvCounts[p]; /* Total to receive. 
*/ + ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; + } + if ( TotalValSend ) { + if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for ind_torecv[]"); + } + + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ + if ( SendCounts[p] ) { + MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], + mpi_int_t, p, iam, grid->comm, &send_req[p]); + } + if ( RecvCounts[p] ) { + MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], + mpi_int_t, p, p, grid->comm, &recv_req[p]); + } + } + for (p = 0; p < procs; ++p) { + if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Allocate storage for the X values to to transferred. */ + if ( TotalIndSend && + !(val_torecv = floatMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for val_torecv[]."); + if ( TotalValSend && + !(val_tosend = floatMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for val_tosend[]."); + + gsmv_comm->extern_start = extern_start; + gsmv_comm->ind_tosend = ind_tosend; + gsmv_comm->ind_torecv = ind_torecv; + gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; + gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; + gsmv_comm->SendCounts = SendCounts; + gsmv_comm->RecvCounts = RecvCounts; + gsmv_comm->val_tosend = val_tosend; + gsmv_comm->val_torecv = val_torecv; + gsmv_comm->TotalIndSend = TotalIndSend; + gsmv_comm->TotalValSend = TotalValSend; + + SUPERLU_FREE(spa); + SUPERLU_FREE(send_req); + +#if ( DEBUGlevel>=2 ) + PrintInt10("psgsmv_init::rowptr", m_loc+1, rowptr); + PrintInt10("psgsmv_init::extern_start", m_loc, extern_start); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv_init()"); +#endif + +} /* PSGSMV_INIT */ + + +/* + * Performs sparse matrix-vector multiplication. 
+ */ +void +psgsmv +( + int_t abs, /* Input. Do abs(A)*abs(x). */ + SuperMatrix *A_internal, /* Input. Matrix A permuted by columns. + The column indices are translated into + the relative positions in the gathered x-vector. + The type of A can be: + Stype = NR_loc; Dtype = SLU_S; Mtype = GE. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */ + float x[], /* Input. The distributed source vector */ + float ax[] /* Output. The distributed destination vector */ +) +{ + NRformat_loc *Astore; + int iam, procs; + int_t i, j, p, m, m_loc, n, fst_row, jcol; + int_t *colind, *rowptr; + int *SendCounts, *RecvCounts; + int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, TotalValSend; + float *nzval, *val_tosend, *val_torecv; + float zero = 0.0; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A_internal->Store; + m = A_internal->nrow; + n = A_internal->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = (float *) Astore->nzval; + extern_start = gsmv_comm->extern_start; + ind_torecv = gsmv_comm->ind_torecv; + ptr_ind_tosend = gsmv_comm->ptr_ind_tosend; + ptr_ind_torecv = gsmv_comm->ptr_ind_torecv; + SendCounts = gsmv_comm->SendCounts; + RecvCounts = gsmv_comm->RecvCounts; + val_tosend = (float *) gsmv_comm->val_tosend; + val_torecv = (float *) gsmv_comm->val_torecv; + TotalValSend = gsmv_comm->TotalValSend; + + /* ------------------------------------------------------------ + COPY THE X VALUES INTO THE SEND BUFFER. 
+ ------------------------------------------------------------*/ + for (i = 0; i < TotalValSend; ++i) { + j = ind_torecv[i] - fst_row; /* Relative index in x[] */ + val_tosend[i] = x[j]; + } + + /* ------------------------------------------------------------ + COMMUNICATE THE X VALUES. + ------------------------------------------------------------*/ + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) { + MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p], + MPI_FLOAT, p, iam, + grid->comm, &send_req[p]); + } + if ( SendCounts[p] ) { + MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p], + MPI_FLOAT, p, p, + grid->comm, &recv_req[p]); + } + } + + /* ------------------------------------------------------------ + PERFORM THE ACTUAL MULTIPLICATION. + ------------------------------------------------------------*/ + if ( abs ) { /* Perform abs(A)*abs(x) */ + /* Multiply the local part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = 0.0; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(x[jcol]); + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]); + } + } + } else { + /* Multiply the local part. 
*/ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = zero; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * x[jcol]; + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * val_torecv[jcol]; + } + } + } + + SUPERLU_FREE(send_req); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv()"); +#endif + +} /* PSGSMV */ + +void psgsmv_finalize(psgsmv_comm_t *gsmv_comm) +{ + int_t *it; + float *dt; + SUPERLU_FREE(gsmv_comm->extern_start); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); + SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); + SUPERLU_FREE(gsmv_comm->SendCounts); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); +} + diff --git a/SRC/psgsmv_AXglobal.c b/SRC/psgsmv_AXglobal.c new file mode 100644 index 00000000..b81b3192 --- /dev/null +++ b/SRC/psgsmv_AXglobal.c @@ -0,0 +1,324 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + + +static void screate_msr_matrix(SuperMatrix *, int_t [], int_t, + float **, int_t **); +static void sPrintMSRmatrix(int, float [], int_t [], gridinfo_t *); + + +int psgsmv_AXglobal_setup +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + Glu_persist_t *Glu_persist, /* input */ + gridinfo_t *grid, /* input */ + int_t *m, /* output */ + int_t *update[], /* output */ + float *val[], /* output */ + int_t *bindx[], /* output */ + int_t *mv_sup_to_proc /* output */ + ) +{ + int n; + int input_option; + int N_update; /* Number of variables updated on this process (output) */ + int iam = grid->iam; + int nprocs = grid->nprow * grid->npcol; + int_t *xsup = Glu_persist->xsup; + int_t *supno = Glu_persist->supno; + int_t nsupers; + int i, nsup, p, t1, t2, t3; + + + /* Initialize the list of global indices. + * NOTE: the list of global indices must be in ascending order. + */ + n = A->nrow; + input_option = SUPER_LINEAR; + nsupers = supno[n-1] + 1; + +#if ( DEBUGlevel>=2 ) + if ( !iam ) { + PrintInt10("xsup", supno[n-1]+1, xsup); + PrintInt10("supno", n, supno); + } +#endif + + if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on + individual rows. */ + /* Figure out mv_sup_to_proc[] on all processes. */ + for (p = 0; p < nprocs; ++p) { + t1 = n / nprocs; /* Number of rows */ + t2 = n - t1 * nprocs; /* left-over, which will be assigned + to the first t2 processes. */ + if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ + else { /* First t2 processes will get one more row. */ + ++t1; /* Number of rows. */ + t2 = p * t1; /* Starting row. */ + } + /* Make sure the starting and ending rows are at the + supernode boundaries. */ + t3 = t2 + t1; /* Ending row. */ + nsup = supno[t2]; + if ( t2 > xsup[nsup] ) { /* Round up the starting row. 
*/ + t1 -= xsup[nsup+1] - t2; + t2 = xsup[nsup+1]; + } + nsup = supno[t3]; + if ( t3 > xsup[nsup] ) /* Round up the ending row. */ + t1 += xsup[nsup+1] - t3; + t3 = t2 + t1 - 1; + if ( t1 ) { + for (i = supno[t2]; i <= supno[t3]; ++i) { + mv_sup_to_proc[i] = p; +#if ( DEBUGlevel>=3 ) + if ( mv_sup_to_proc[i] == p-1 ) { + fprintf(stderr, + "mv_sup_to_proc conflicts at supno %d\n", i); + exit(-1); + } +#endif + } + } + + if ( iam == p ) { + N_update = t1; + if ( N_update ) { + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + } + for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; +#if ( DEBUGlevel>=3 ) + printf("(%2d) N_update = %4d\t" + "supers %4d to %4d\trows %4d to %4d\n", + iam, N_update, supno[t2], supno[t3], t2, t3); +#endif + } + } /* for p ... */ + } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on + individual supernodes. */ + /* This may cause bad load balance, because the blocks are usually + small in the beginning and large toward the end. */ + t1 = nsupers / nprocs; + t2 = nsupers - t1 * nprocs; /* left-over */ + if ( iam >= t2 ) t2 += (iam * t1); + else { + ++t1; /* Number of blocks. */ + t2 = iam * t1; /* Starting block. */ + } + N_update = xsup[t2+t1] - xsup[t2]; + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; + } + + + /* Create an MSR matrix in val/bindx to be used by pdgsmv(). */ + screate_msr_matrix(A, *update, N_update, val, bindx); + +#if ( DEBUGlevel>=2 ) + PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); + sPrintMSRmatrix(N_update, *val, *bindx, grid); +#endif + + *m = N_update; + return 0; +} /* PSGSMV_AXglobal_SETUP */ + + +/*! \brief + * + *
+ * Create the distributed modified sparse row (MSR) matrix: bindx/val.
+ * For a submatrix of size m-by-n, the MSR arrays are as follows:
+ *    bindx[0]      = m + 1
+ *    bindx[0..m]   = pointer to start of each row
+ *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
+ *                    where, ks = bindx[k], ke = bindx[k+1]-1
+ *    val[k]        = A(k,k), k < m, diagonal elements
+ *    val[m]        = not used
+ *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
+ * Both arrays are of length nnz + 1.
+ * 
+*/ +static void screate_msr_matrix +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t update[], /* input (local) */ + int_t N_update, /* input (local) */ + float **val, /* output */ + int_t **bindx /* output */ +) +{ + int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; + NCPformat *Astore; + float *nzval; + int_t *rowcnt; + double zero = 0.0; + + if ( !N_update ) return; + + n = A->ncol; + Astore = A->Store; + nzval = Astore->nzval; + + /* One pass of original matrix A to count nonzeros of each row. */ + if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) + ABORT("Malloc fails for rowcnt[]"); + lo = update[0]; + hi = update[N_update-1]; + nnz_local = 0; + nnz_diag = 0; + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow != j ) /* Exclude diagonal */ + ++rowcnt[irow - lo]; + else ++nnz_diag; /* Count nonzero diagonal entries */ + ++nnz_local; + } + } + } + + /* Add room for the logical diagonal zeros which are not counted + in nnz_local. */ + nnz_local += (N_update - nnz_diag); + + /* Allocate storage for bindx[] and val[]. */ + if ( !(*val = (float *) floatMalloc_dist(nnz_local+1)) ) + ABORT("Malloc fails for val[]"); + for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ + if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) + ABORT("Malloc fails for bindx[]"); + + /* Set up row pointers. */ + (*bindx)[0] = N_update + 1; + for (j = 1; j <= N_update; ++j) { + (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; + rowcnt[j-1] = (*bindx)[j-1]; + } + + /* One pass of original matrix A to fill in matrix entries. 
*/ + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow == j ) /* Diagonal */ + (*val)[irow - lo] = nzval[i]; + else { + irow -= lo; + k = rowcnt[irow]; + (*bindx)[k] = j; + (*val)[k] = nzval[i]; + ++rowcnt[irow]; + } + } + } + } + + SUPERLU_FREE(rowcnt); +} + +/*! \brief + * + *
+ * Performs sparse matrix-vector multiplication.
+ *   - val/bindx stores the distributed MSR matrix A
+ *   - X is global
+ *   - ax product is distributed the same way as A
+ * 
+ */ +int +psgsmv_AXglobal(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += val[k] * X[j]; + } + ax[i] += val[i] * X[update[i]]; /* diagonal */ + } + return 0; +} /* PSGSMV_AXglobal */ + +/* + * Performs sparse matrix-vector multiplication. + * - val/bindx stores the distributed MSR matrix A + * - X is global + * - ax product is distributed the same way as A + */ +int +psgsmv_AXglobal_abs(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += fabs(val[k]) * fabs(X[j]); + } + ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */ + } + + return 0; +} /* PSGSMV_AXglobal_ABS */ + +/* + * Print the local MSR matrix + */ +static void sPrintMSRmatrix +( + int m, /* Number of rows of the submatrix. */ + float val[], + int_t bindx[], + gridinfo_t *grid +) +{ + int iam, nnzp1; + + if ( !m ) return; + + iam = grid->iam; + nnzp1 = bindx[m]; + printf("(%2d) MSR submatrix has %d rows -->\n", iam, m); + Printfloat5("val", nnzp1, val); + PrintInt10("bindx", nnzp1, bindx); +} diff --git a/SRC/psgsrfs.c b/SRC/psgsrfs.c new file mode 100644 index 00000000..db734585 --- /dev/null +++ b/SRC/psgsrfs.c @@ -0,0 +1,260 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Last modified:
+ * December 31, 2015
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSRFS improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (local)
+ *        The m_loc-by-NRHS right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (local)
+ *        On entry, the solution matrix Y, as computed by PSGSTRS, of the
+ *            transformed system A1*Y = Pc*Pr*B. where
+ *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
+ *        On exit, the improved solution matrix Y.
+ *
+ *        In order to obtain the solution X to the original system,
+ *        Y should be permutated by Pc^T, and premultiplied by diag(C)
+ *        if DiagScale = COL or BOTH.
+ *        This must be done after this routine is called.
+ *
+ * ldx    (input) int (local)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (output) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
+ * 
+ */ +void +psgsrfs(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct, + sScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, + float *B, int_t ldb, float *X, int_t ldx, int nrhs, + sSOLVEstruct_t *SOLVEstruct, + float *berr, SuperLUStat_t *stat, int *info) +{ +#define ITMAX 20 + + float *ax, *R, *dx, *temp, *work, *B_col, *X_col; + int_t count, i, j, lwork, nz; + int iam; + float eps, lstres; + float s, safmin, safe1, safe2; + + /* Data structures used by matrix-vector multiply routine. */ + psgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; + NRformat_loc *Astore; + int_t m_loc, fst_row; + + + /* Initialization. */ + Astore = (NRformat_loc *) A->Store; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + iam = grid->iam; + + /* Test the input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; + else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; + else if ( nrhs < 0 ) *info = -13; + if (*info != 0) { + i = -(*info); + pxerr_dist("PSGSRFS", grid, i); + return; + } + + /* Quick return if possible. */ + if ( n == 0 || nrhs == 0 ) { + return; + } + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgsrfs()"); +#endif + + lwork = 2 * m_loc; /* For ax/R/dx and temp */ + if ( !(work = floatMalloc_dist(lwork)) ) + ABORT("Malloc fails for work[]"); + ax = R = dx = work; + temp = ax + m_loc; + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = smach_dist("Epsilon"); + safmin = smach_dist("Safe minimum"); + + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. */ + safe1 = nz * safmin; + safe2 = safe1 / eps; + +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf(".. 
eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", + eps, anorm, safe1, safe2); +#endif + + /* Do for each right-hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + B_col = &B[j*ldb]; + X_col = &X[j*ldx]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + + /* Matrix-vector multiply. */ + psgsmv(0, A, grid, gsmv_comm, X_col, ax); + + /* Compute residual, stored in R[]. */ + for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i]; + + /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ + psgsmv(1, A, grid, gsmv_comm, X_col, temp); + for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]); + + s = 0.0; + for (i = 0; i < m_loc; ++i) { + if ( temp[i] > safe2 ) { + s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); + } else if ( temp[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]); + } + /* If temp[i] is exactly 0.0 (computed by PxGSMV), then + we know the true residual also must be exactly 0.0. */ + } + MPI_Allreduce( &s, &berr[j], 1, MPI_FLOAT, MPI_MAX, grid->comm ); + +#if ( PRNTlevel>= 1 ) + if ( !iam ) + printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); +#endif + if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { + /* Compute new dx. */ + psgstrs(n, LUstruct, ScalePermstruct, grid, + dx, m_loc, fst_row, m_loc, 1, + SOLVEstruct, stat, info); + + /* Update solution. */ + for (i = 0; i < m_loc; ++i) X_col[i] += dx[i]; + + lstres = berr[j]; + ++count; + } else { + break; + } + } /* end while */ + + stat->RefineSteps = count; + + } /* for j ... */ + + /* Deallocate storage. 
*/ + SUPERLU_FREE(work); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsrfs()"); +#endif + +} /* PSGSRFS */ + diff --git a/SRC/psgsrfs_ABXglobal.c b/SRC/psgsrfs_ABXglobal.c new file mode 100644 index 00000000..989f591d --- /dev/null +++ b/SRC/psgsrfs_ABXglobal.c @@ -0,0 +1,465 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Improves the computed solution and provies error bounds + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015  version 4.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*-- Function prototypes --*/ +static void gather_1rhs_diag_to_all(int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t, int_t [], + int_t [], float [], float []); +static void redist_all_to_diag(int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t [], float []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgsrfs_ABXglobal improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
+ *        are permutation matrices. The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ *        NOTE: Currently, A must reside in all processes when calling
+ *              this routine.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (global)
+ *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *
+ *        NOTE: Currently, B must reside on all processes when calling
+ *              this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (global)
+ *        On entry, the solution matrix X, as computed by PSGSTRS.
+ *        On exit, the improved solution matrix X.
+ *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
+ *        in order to obtain the solution to the original system.
+ *
+ *        NOTE: Currently, X must reside on all processes when calling
+ *              this routine.
+ *
+ * ldx    (input) int (global)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
+ * 
+ */ + +void +psgsrfs_ABXglobal(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct, + gridinfo_t *grid, float *B, int_t ldb, float *X, int_t ldx, + int nrhs, float *berr, SuperLUStat_t *stat, int *info) +{ + + +#define ITMAX 20 + + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + /* + * Data structures used by matrix-vector multiply routine. + */ + int_t N_update; /* Number of variables updated on this process */ + int_t *update; /* vector elements (global index) updated + on this processor. */ + int_t *bindx; + float *val; + int_t *mv_sup_to_proc; /* Supernode to process mapping in + matrix-vector multiply. */ + /*-- end data structures for matrix-vector multiply --*/ + float *b, *ax, *R, *B_col, *temp, *work, *X_col, + *x_trs, *dx_trs; + int_t count, ii, j, jj, k, knsupc, lk, lwork, + nprow, nsupers, nz, p; + int i, iam, pkk; + int_t *ilsum, *xsup; + double eps, lstres; + double s, safmin, safe1, safe2; + + /* NEW STUFF */ + int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ + int_t *diag_len; /* Length of the X vector on diagonal processes. */ + + /*-- Function prototypes --*/ + extern void psgstrs1(int_t, sLUstruct_t *, gridinfo_t *, + float *, int, SuperLUStat_t *, int *); + + /* Test the input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NCP || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; + else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; + else if ( nrhs < 0 ) *info = -13; + if (*info != 0) { + i = -(*info); + pxerr_dist("psgsrfs_ABXglobal", grid, i); + return; + } + + /* Quick return if possible. */ + if ( n == 0 || nrhs == 0 ) { + return; + } + + /* Initialization. 
*/ + iam = grid->iam; + nprow = grid->nprow; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgsrfs_ABXglobal()"); +#endif + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. number of diag processes = " IFMT "\n", num_diag_procs); + PrintInt10("diag_procs", num_diag_procs, diag_procs); + PrintInt10("diag_len", num_diag_procs, diag_len); + } +#endif + + if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) + ABORT("Calloc fails for mv_sup_to_proc[]"); + + psgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, + &val, &bindx, mv_sup_to_proc); + + i = CEILING( nsupers, nprow ); /* Number of local block rows */ + ii = Llu->ldalsum + i * XK_H; + k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); + jj = SUPERLU_MAX( jj, N_update ); + lwork = N_update /* For ax and R */ + + ii /* For dx_trs */ + + ii /* For x_trs */ + + k /* For b */ + + jj; /* for temp */ + if ( !(work = floatMalloc_dist(lwork)) ) + ABORT("Malloc fails for work[]"); + ax = R = work; + dx_trs = work + N_update; + x_trs = dx_trs + ii; + b = x_trs + ii; + temp = b + k; + +#if ( DEBUGlevel>=2 ) + { + float *dwork = floatMalloc_dist(n); + for (i = 0; i < n; ++i) { + if ( i & 1 ) dwork[i] = 1.; + else dwork[i] = 2.; + } + /* Check correctness of matrix-vector multiply. */ + psgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); + Printfloat5("Mult A*x", N_update, ax); + SUPERLU_FREE(dwork); + } +#endif + + + /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ + nz = A->ncol + 1; + eps = smach_dist("Epsilon"); + safmin = smach_dist("Safe minimum"); + + /* Set SAFE1 essentially to be the underflow threshold times the + number of additions in each row. 
*/ + safe1 = nz * safmin; + safe2 = safe1 / eps; + +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", + eps, anorm, safe1, safe2); +#endif + + /* Do for each right-hand side ... */ + for (j = 0; j < nrhs; ++j) { + count = 0; + lstres = 3.; + + /* Copy X into x on the diagonal processes. */ + B_col = &B[j*ldb]; + X_col = &X[j*ldx]; + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + jj = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; + dx_trs[ii-XK_H] = k;/* Block number prepended in header. */ + } + } + } + /* Copy B into b distributed the same way as matrix-vector product. */ + if ( N_update ) ii = update[0]; + for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; + + while (1) { /* Loop until stopping criterion is satisfied. */ + + /* Compute residual R = B - op(A) * X, + where op(A) = A, A**T, or A**H, depending on TRANS. */ + + /* Matrix-vector multiply. */ + psgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); + + /* Compute residual. */ + for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i]; + + /* Compute abs(op(A))*abs(X) + abs(B). */ + psgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp); + for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]); + + s = 0.0; + for (i = 0; i < N_update; ++i) { + if ( temp[i] > safe2 ) { + s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); + } else if ( temp[i] != 0.0 ) { + /* Adding SAFE1 to the numerator guards against + spuriously zero residuals (underflow). */ + s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]); + } + /* If temp[i] is exactly 0.0 (computed by PxGSMV), then + we know the true residual also must be exactly 0.0. */ + } + MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); + +#if ( PRNTlevel>= 1 ) + if ( !iam ) + printf("(%2d) .. 
Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); +#endif + if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { + /* Compute new dx. */ + redist_all_to_diag(n, R, Glu_persist, Llu, grid, + mv_sup_to_proc, dx_trs); + psgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); + + /* Update solution. */ + for (p = 0; p < num_diag_procs; ++p) + if ( iam == diag_procs[p] ) + for (k = p; k < nsupers; k += num_diag_procs) { + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + knsupc = SuperSize( k ); + for (i = 0; i < knsupc; ++i) + x_trs[i + ii] += dx_trs[i + ii]; + } + lstres = berr[j]; + ++count; + /* Transfer x_trs (on diagonal processes) into X + (on all processes). */ + gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, + num_diag_procs, diag_procs, diag_len, + X_col, temp); + } else { + break; + } + } /* end while */ + + stat->RefineSteps = count; + + } /* for j ... */ + + + /* Deallocate storage used by matrix-vector multiplication. */ + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + if ( N_update ) { + SUPERLU_FREE(update); + SUPERLU_FREE(bindx); + SUPERLU_FREE(val); + } + SUPERLU_FREE(mv_sup_to_proc); + SUPERLU_FREE(work); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsrfs_ABXglobal()"); +#endif + +} /* PSGSRFS_ABXGLOBAL */ + + +/*! \brief + * + *
+ * r[] is the residual vector distributed the same way as
+ * matrix-vector product.
+ * 
+ */ +static void +redist_all_to_diag(int_t n, float r[], Glu_persist_t *Glu_persist, + sLocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[], + float work[]) +{ + int_t i, ii, k, lk, lr, nsupers; + int_t *ilsum, *xsup; + int iam, knsupc, psrc, pkk; + MPI_Status status; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + lr = 0; + + for (k = 0; k < nsupers; ++k) { + pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid ); + psrc = mv_sup_to_proc[k]; + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + if ( iam == psrc ) { + if ( iam != pkk ) { /* Send X component. */ + MPI_Send( &r[lr], knsupc, MPI_FLOAT, pkk, Xk, + grid->comm ); + } else { /* Local copy. */ + for (i = 0; i < knsupc; ++i) + work[i + ii] = r[i + lr]; + } + lr += knsupc; + } else { + if ( iam == pkk ) { /* Recv X component. */ + MPI_Recv( &work[ii], knsupc, MPI_FLOAT, psrc, Xk, + grid->comm, &status ); + } + } + } +} /* REDIST_ALL_TO_DIAG */ + + +/*! \brief + * + *
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * 
+ */ +static void +gather_1rhs_diag_to_all(int_t n, float x[], + Glu_persist_t *Glu_persist, sLocalLU_t *Llu, + gridinfo_t *grid, int_t num_diag_procs, + int_t diag_procs[], int_t diag_len[], + float y[], float work[]) +{ + int_t i, ii, k, lk, lwork, nsupers, p; + int_t *ilsum, *xsup; + int iam, knsupc, pkk; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy x vector into a buffer. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = ilsum[lk] + (lk+1)*XK_H; + for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii]; + lwork += knsupc; + } + MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( work, diag_len[p], MPI_FLOAT, pkk, grid->comm ); + } + /* Scatter work[] into global y vector. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + ii = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork]; + lwork += knsupc; + } + } +} /* GATHER_1RHS_DIAG_TO_ALL */ + diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c new file mode 100644 index 00000000..73020a02 --- /dev/null +++ b/SRC/psgssvx.c @@ -0,0 +1,1579 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B + * + *
+ * -- Distributed SuperLU routine (version 6.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * October  1, 2014
+ * April 5, 2015
+ * December 31, 2015  version 4.3
+ * December 31, 2016  version 5.1.3
+ * April 10, 2018  version 5.3
+ * September 18, 2018  version 6.0
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSSVX solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and both
+ *             row and column permutation vectors perm_r and perm_c, as
+ *             well as the distributed data structure set up from the
+ *             previous symbolic factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
+ * 
+ */ + +void +psgssvx(superlu_dist_options_t *options, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo_t *grid, + sLUstruct_t *LUstruct, sSOLVEstruct_t *SOLVEstruct, float *berr, + SuperLUStat_t *stat, int *info) +{ + NRformat_loc *Astore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + float *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + fact_t Fact; + float *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR*/ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam,iam_g; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use = 0.0; /* memory usage by global A */ + float dist_mem_use = 0.0; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + int64_t nnzLU; + int_t nnz_tot; + float *nzval_a; + float asum,asum_tot,lsum,lsum_tot; + int_t nsupers,nsupers_j; + int_t lk,k,knsupc,nsupr; + int_t *lsub,*xsup; + float *lusup; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 's'; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep, parSymbFact; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + /* Initialization. */ + m = A->nrow; + n = A->ncol; + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (float *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + num_mem_usage.for_lu = num_mem_usage.total = 0.0; + symb_mem_usage.for_lu = symb_mem_usage.total = 0.0; + + /* Test the input parameters. 
*/ + *info = 0; + Fact = options->Fact; + if ( Fact < DOFACT || Fact > FACTORED ) + *info = -1; + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) + *info = -1; + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) + *info = -1; + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) + *info = -1; + else if ( options->IterRefine == SLU_EXTRA ) { + *info = -1; + printf("ERROR: Extra precise iterative refinement yet to support.\n"); + } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < m_loc ) + *info = -5; + else if ( nrhs < 0 ) + *info = -6; + if ( sp_ienv_dist(2) > sp_ienv_dist(3) ) { + *info = 1; + printf("ERROR: Relaxation (NREL) cannot be larger than max. supernode size (NSUP).\n" + "\t-> Check parameter setting in sp_ienv_dist.c to correct error.\n"); + } + if ( *info ) { + i = -(*info); + pxerr_dist("psgssvx", grid, -*info); + return; + } + + factored = (Fact == FACTORED); + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + parSymbFact = options->ParSymbFact; + + iam = grid->iam; + job = 5; + if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } else rowequ = colequ = FALSE; + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgssvx()"); +#endif + + /* Not factored & ask for equilibration */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + /* Allocate storage if not done so before. 
*/ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + * Diagonal scaling to equilibrate the matrix. (simple scheme) + * for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:)); + * for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j)) + * ------------------------------------------------------------*/ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter equil"); +#endif + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm ) { + /* Reuse R and C. */ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + a[i] *= R[irow]; /* Scale rows. */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j+1]; ++i){ + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. 
*/ + psgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pslaqgs(A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * For serial symbolic factorization, gather A from the distributed + * compressed row format to global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if ( Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO) ) { + /* Performs serial symbolic factorzation and/or MC64 */ + + need_value = (options->RowPerm == LargeDiag_MC64); + + psCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof(int_t); + + if ( need_value ) { + a_GA = (float *) GAstore->nzval; + GA_mem_use += nnz * sizeof(float); + } else assert(GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation Pr for A, and apply Pr*[GA]. + GA is overwritten by Pr*[GA]. + ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + t = SuperLU_timer_(); + if ( Fact != SamePattern_SameRowPerm ) { + if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] from MC64 */ + if ( job == 5 ) { + /* Allocate storage for scaling factors. 
*/ + if ( !(R1 = floatMalloc_dist(m)) ) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if ( !(C1 = floatMalloc_dist(n)) ) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + if ( !iam ) { /* Process 0 finds a row permutation */ + iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = smach_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if ( job == 5 ) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j+1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if ( perm_r[irow] == icol ) { /* New diagonal */ + if ( job == 2 || job == 3 ) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if ( job == 4 ) + dsum += fabs(a[i]); + else if ( job == 5 ) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; + else for (i = 0; i < m; ++i) R[i] = R1[i]; + if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; + else for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end Equil */ + + /* Now permute global GA to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( PRNTlevel>=2 ) + if ( job == 2 || job == 3 ) { + if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); + } else if ( job == 4 ) { + if ( !iam ) printf("\tsum of diagonal %e\n", dsum); + } else if ( job == 5 ) { + if ( !iam ) printf("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + s_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if options->RowPerm ... */ + + t = SuperLU_timer_() - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact ... */ + + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) PrintInt10("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if ( notran ) *(unsigned char *)norm = '1'; + else *(unsigned char *)norm = 'I'; + anorm = pslangs(norm, A, grid); +#if ( PRNTlevel>=1 ) + if ( !iam ) { printf(".. anorm %e\n", anorm); fflush(stdout); } +#endif + } + + /* ------------------------------------------------------------ + Perform the LU factorization: symbolic factorization, + redistribution, and numerical factorization. + ------------------------------------------------------------*/ + if ( !factored ) { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if ( parSymbFact == YES || permc_spec == PARMETIS ) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num )))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) col = 0; + else col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm ); + + if ( permc_spec == NATURAL || permc_spec == MY_PERMC ) { + if ( permc_spec == NATURAL ) { + for (j = 0; j < n; ++j) perm_c[j] = j; + } + if ( !(sizes = intMalloc_dist(2 * noDomains)) ) + ABORT("SUPERLU_MALLOC fails for sizes."); + if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) ) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2*noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2*noDomains - 2] = m; + fstVtxSep[2*noDomains - 2] = 0; + } else if ( permc_spec != PARMETIS ) { /* same as before */ + printf("{" IFMT "," IFMT "}: psgssvx: invalid ColPerm option when ParSymbfact is used\n", + 
MYROW(grid->iam, grid), MYCOL(grid->iam, grid)); + } + } + + if ( permc_spec != MY_PERMC && Fact == DOFACT ) { + /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */ + if ( permc_spec == PARMETIS ) { + // #pragma omp parallel + // { + // #pragma omp master + // { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + // } + // } + if (flinfo > 0) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n"); +#endif + *info = flinfo; + return; + } + } else { + get_perm_c_dist(iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Symbolic factorization. */ + if ( Fact != SamePattern_SameRowPerm ) { + if ( parSymbFact == NO ) { /* Perform serial symbolic factorization */ + /* GA = Pr*A, perm_r[] is already applied. */ + int_t *GACcolbeg, *GACcolend, *GACrowind; + + /* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. + After this routine, GAC = GA*Pc^T. */ + sp_colorder(options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up + the nonzero data structures for L & U. 
*/ +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); + fflush(stdout); + } +#endif + t = SuperLU_timer_(); + if ( !(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) + ABORT("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + nnzLU = Glu_freeable->nnzLU; + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if ( iinfo <= 0 ) { /* Successful return */ + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); + printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); + printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu*1e-6, + symb_mem_usage.total*1e-6, + symb_mem_usage.expansions); + fflush(stdout); + } +#endif + } else { /* symbfact out of memory */ +#if ( PRNTlevel>=1 ) + if ( !iam ) + fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo); +#endif + *info = iinfo; + return; + } + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_(); + flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + nnzLU = Pslu_freeable.nnzLU; + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (flinfo > 0) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "Insufficient memory for parallel symbolic factorization."); +#endif + *info = flinfo; + return; + } + } + + /* Destroy global GA */ + if ( parSymbFact == NO || options->RowPerm != NO ) + Destroy_CompCol_Matrix_dist(&GA); + if ( 
parSymbFact == NO ) + Destroy_CompCol_Permuted_dist(&GAC); + + } /* end if Fact != SamePattern_SameRowPerm ... */ + + if (sizes) SUPERLU_FREE (sizes); + if (fstVtxSep) SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); + + /* Distribute entries of A into L & U data structures. */ + //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) { + if ( parSymbFact == NO ) { + /* CASE OF SERIAL SYMBOLIC */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); + dist_mem_use = psdistribute(Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + } else { /* CASE OF PARALLEL SYMBOLIC */ + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = sdist_psymbtonum(Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]);*/ + + /* Perform numerical factorization in parallel. 
*/ + t = SuperLU_timer_(); + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + // } + // } + + +#if ( PRNTlevel>=2 ) + /* ------------------------------------------------------------ + SUM OVER ALL ENTRIES OF A AND PRINT NNZ AND SIZE OF A. + ------------------------------------------------------------*/ + Astore = (NRformat_loc *) A->Store; + xsup = Glu_persist->xsup; + nzval_a = Astore->nzval; + + + asum=0; + for (i = 0; i < Astore->m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + asum += nzval_a[j]; + } + } + + nsupers = Glu_persist->supno[n-1] + 1; + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + + + lsum=0.0; + for (lk=0;lkLlu->Lrowind_bc_ptr[lk]; + lusup = LUstruct->Llu->Lnzval_bc_ptr[lk]; + if(lsub){ + k = MYCOL(grid->iam, grid)+lk*grid->npcol; /* not sure */ + knsupc = SuperSize( k ); + nsupr = lsub[1]; + for (j=0; jcomm ); + MPI_Allreduce( &lsum, &lsum_tot,1, MPI_FLOAT, MPI_SUM, grid->comm ); + + + MPI_Allreduce( &Astore->rowptr[Astore->m_loc], &nnz_tot,1, mpi_int_t, MPI_SUM, grid->comm ); + // MPI_Bcast( &nnzLU, 1, mpi_int_t, 0, grid->comm ); + + MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); + + printf(".. 
Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); + fflush(stdout); +#endif + +#if 0 + +// #ifdef GPU_PROF + +// if(!iam ) +// { +// char* ttemp; + +// ttemp = getenv("IO_FILE"); +// if(ttemp!=NULL) +// { +// printf("File being opend is %s\n",ttemp ); +// FILE* fp; +// fp = fopen(ttemp,"w"); +// if(!fp) +// { +// fprintf(stderr," Couldn't open output file %s\n",ttemp); +// } + +// int nsup=Glu_persist->supno[n-1]+1; +// int ii; +// for (ii = 0; ii < nsup; ++ii) +// { +// fprintf(fp,"%d,%d,%d,%d,%d,%d\n",gs1.mnk_min_stats[ii],gs1.mnk_min_stats[ii+nsup], +// gs1.mnk_min_stats[ii+2*nsup], +// gs1.mnk_max_stats[ii],gs1.mnk_max_stats[ii+nsup],gs1.mnk_max_stats[ii+2*nsup]); +// } + +// // lastly put the timeing stats that we need + +// fprintf(fp,"Min %lf Max %lf totaltime %lf \n",gs1.osDgemmMin,gs1.osDgemmMax,stat->utime[FACT]); +// fclose(fp); +// } + +// } +// #endif + +#endif + + if ( options->PrintStat ) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use); + if ( options->RowPerm != NO ) + temp = SUPERLU_MAX(temp, GA_mem_use); + } else { + temp = SUPERLU_MAX ( + symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX(temp, num_mem_usage.total); + + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, 
grid->comm ); + stat->TinyPivots = TinyPivots; + + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if (!factored) */ + + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + sSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + options->SolveInitialized = NO; /* Reset the solve state */ + } + } +#if 0 + /* Need to revisit: Why the following is not good enough for X-to-B + distribution -- inv_perm_c changed */ + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct); +#endif + + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. 
+ ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { +#if 0 /* Sherry */ + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; +#endif + memcpy(x_col, b_col, m_loc * sizeof(float)); + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + if ( options->SolveInitialized == NO ) { /* First time */ + sSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, + SOLVEstruct); + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to psgssvx(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + } + + if ( options->DiagInv==YES && + (options->SolveInitialized == NO || Fact == SamePattern || + Fact == SamePattern_SameRowPerm) ) { + psCompute_Diag_Inv(n, LUstruct, grid, stat, info); + } + + + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + // } + // } + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. 
+ ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. */ + int_t *it; + int_t *colind_gsmv = SOLVEstruct->A_colind_gsmv; + /* This was allocated and set to NULL in sSolveInit() */ + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + + t = SuperLU_timer_(); + if ( options->RefineInitialized == NO || Fact == DOFACT ) { + /* All these cases need to re-initialize gsmv structure */ + if ( options->RefineInitialized ) + psgsmv_finalize(SOLVEstruct->gsmv_comm); + psgsmv_init(A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. */ + if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); + if ( !(it = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } else if ( Fact == SamePattern || + Fact == SamePattern_SameRowPerm ) { + float atemp; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in psgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + atemp = a[k]; a[k] = a[j]; a[j] = atemp; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to psgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; + } + + if ( nrhs == 1 ) { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } else { /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. 
+ */ + if ( !(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t))) ) + ABORT("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + ABORT("Malloc fails for gstrs_comm[]"); + psgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if ( nrhs > 1 ) { + pxgstrs_finalize(SOLVEstruct1->gstrs_comm); + SUPERLU_FREE(SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_() - t; + } /* end if IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf("\n (%d) .. After psPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before equilibration. 
*/ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 && *info == 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it was not used. */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + +#if 0 + if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist(&GAC); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx()"); +#endif + +} diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c new file mode 100644 index 00000000..80bdd329 --- /dev/null +++ b/SRC/psgssvx3d.c @@ -0,0 +1,1589 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PSGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + sScalePermstruct_t * ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + sLUstruct_t * LUstruct, sSOLVEstruct_t * SOLVEstruct, + float *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + float *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + float *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 's'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("psgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + sGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (float *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter psgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (float *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = 
(options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. 
*/ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + psgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pslaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + psCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (float *) GAstore->nzval; + GA_mem_use += nnz * sizeof (float); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = floatMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = floatMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = sldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + s_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pslangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: psgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = psdistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = sdist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_FLOAT, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + sp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = sinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + psgstrf3d (options, m, 
n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + sgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + s3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + sDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, 
MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = floatMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = floatMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to psgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + sSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + psgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + psgsmv_finalize (SOLVEstruct->gsmv_comm); + psgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + float at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + psgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) sScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgssvx3d()"); +#endif + +} diff --git a/SRC/psgssvx_ABglobal.c b/SRC/psgssvx_ABglobal.c new file mode 100644 index 00000000..f4d582a3 --- /dev/null +++ b/SRC/psgssvx_ABglobal.c @@ -0,0 +1,1112 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B, + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015   version 4.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgssvx_ABglobal solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ *
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has several options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *      -  A, the input matrix
+ *
+ *      as well as the following options, which are described in more
+ *      detail below:
+ *
+ *      -  options->Equil,   to specify how to scale the rows and columns
+ *                           of A to "equilibrate" it (to try to reduce its
+ *                           condition number and so improve the
+ *                           accuracy of the computed solution)
+ *
+ *      -  options->RowPerm, to specify how to permute the rows of A
+ *                           (typically to control numerical stability)
+ *
+ *      -  options->ColPerm, to specify how to permute the columns of A
+ *                           (typically to control fill-in and enhance
+ *                           parallelism during factorization)
+ *
+ *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                           pivots encountered during factorization
+ *                           (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *      -  ScalePermstruct,  modified to describe how the input matrix A
+ *                           was equilibrated and permuted:
+ *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                        columns of A were scaled
+ *         -  ScalePermstruct->R, array of row scale factors
+ *         -  ScalePermstruct->C, array of column scale factors
+ *         -  ScalePermstruct->perm_r, row permutation vector
+ *         -  ScalePermstruct->perm_c, column permutation vector
+ *
+ *            (part of ScalePermstruct may also need to be supplied on input,
+ *             depending on options->RowPerm and options->ColPerm as described
+ *             later).
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *                Pc*Pr*diag(R)*A*diag(C)
+ *             where
+ *                Pr and Pc are row and columns permutation matrices determined
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
+ *                  respectively, and
+ *                diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *      -  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
+ *               in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In this
+ *            case the algorithm saves time by reusing the previously computed
+ *            column permutation vector stored in ScalePermstruct->perm_c
+ *            and the "elimination tree" of A stored in LUstruct->etree.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->RowPerm
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->perm_c, the column permutation
+ *      -  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated and row permuted
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are ignored.
+ *      This is because the permutations from ScalePermstruct->perm_r and
+ *      ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
+ *                                     column scaled
+ *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
+ *      -  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                             if any
+ *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                  matrix
+ *      -  all of LUstruct, the previously computed information about L and U
+ *                (the actual numerical values of L and U stored in
+ *                 LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated
+ *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *      -  A, the unfactored matrix, only in the case that iterative refinement
+ *            is to be done (specifically A must be the output A from
+ *            the previous call, so that it has been scaled and permuted)
+ *      -  all of ScalePermstruct
+ *      -  all of LUstruct, including the actual numerical values of L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_APWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix*
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *         The number of linear equations is A->nrow. The type of A must be:
+ *         Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE. That is, A is stored in
+ *         compressed column format (also known as Harwell-Boeing format).
+ *         See supermatrix.h for the definition of 'SuperMatrix'.
+ *         This routine only handles square A, however, the LU factorization
+ *         routine psgstrf can factorize rectangular matrices.
+ *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
+ *         depending on ScalePermstruct->DiagScale, options->RowPerm and
+ *         options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->RowPerm != NATURAL, A is further overwritten by
+ *                Pr*diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                Pc*Pr*diag(R)*A*diag(C).
+ *         If all the above conditions are true, the LU decomposition is
+ *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ *         NOTE: Currently, A must reside in all processes when calling
+ *               this routine.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t*
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float*
+ *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
+ *         On exit, the solution matrix if info = 0;
+ *
+ *         NOTE: Currently, B must reside in all processes when calling
+ *               this routine.
+ *
+ * ldb     (input) int (global)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t*
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * berr    (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
+ * 
+ */ +void +psgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo_t *grid, + sLUstruct_t *LUstruct, float *berr, + SuperLUStat_t *stat, int *info) +{ + SuperMatrix AC; + NCformat *Astore; + NCPformat *ACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by DDISTRIBUTE + routine. They will be freed after DDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + fact_t Fact; + float *a; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *colptr, *rowind; + int_t Equil, factored, job, notran, colequ, rowequ; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; + int iam; + int ldx; /* LDA for matrix X (global). */ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + static superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + LUstruct->dt = 's'; + + /* Test input parameters. 
*/ + *info = 0; + Fact = options->Fact; + if ( Fact < 0 || Fact > FACTORED ) + *info = -1; + else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + *info = -1; + else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + *info = -1; + else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + *info = -1; + else if ( options->IterRefine == SLU_EXTRA ) { + *info = -1; + fprintf(stderr, "Extra precise iterative refinement yet to support."); + } else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < A->nrow ) + *info = -5; + else if ( nrhs < 0 ) + *info = -6; + if ( *info ) { + i = -(*info); + pxerr_dist("psgssvx_ABglobal", grid, -*info); + return; + } + + /* Initialization */ + factored = (Fact == FACTORED); + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + iam = grid->iam; + job = 5; + m = A->nrow; + n = A->ncol; + Astore = A->Store; + nnz = Astore->nnz; + a = Astore->nzval; + colptr = Astore->colptr; + rowind = Astore->rowind; + if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } else rowequ = colequ = FALSE; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgssvx_ABglobal()"); +#endif + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + if ( Equil && Fact != SamePattern_SameRowPerm ) { + /* Allocate storage if not done so before. 
*/ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------*/ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter equil"); +#endif + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm ) { + /* Reuse R and C. */ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + break; + case ROW: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow]; /* Scale rows. */ + } + } + break; + case COL: + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) + a[i] *= C[j]; /* Scale columns. */ + break; + case BOTH: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ + } + } + break; + } + } else { + if ( !iam ) { + /* Compute row and column scalings to equilibrate matrix A. 
*/ + sgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } else { + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", + iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", + iinfo-n); +#endif + } + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } + } + + if ( iinfo == 0 ) { + /* Equilibrate matrix A. */ + slaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + } + +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. equilibrated? *equed = %c\n", *equed); + /*fflush(stdout);*/ + } +#endif + } /* if Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit equil"); +#endif + } /* end if Equil ... */ + + /* ------------------------------------------------------------ + Permute rows of A. 
+ ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ + || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( !factored ) { + if ( job == 5 ) { + /* Allocate storage for scaling factors. */ + if ( !(R1 = (float *) SUPERLU_MALLOC(m * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if ( !(C1 = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + if ( !iam ) { + /* Process 0 finds a row permutation for large diagonal. */ + iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a, + perm_r, R1, C1); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } + + if ( iinfo && job == 5) { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } + +#if ( PRNTlevel>=2 ) + dmin = smach_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if ( job == 5 ) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) /* New diagonal */ + dprod *= fabs(a[i]); +#endif + } + } + + /* Multiply together the scaling factors. 
*/ + if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; + else for (i = 0; i < m; ++i) R[i] = R1[i]; + if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; + else for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + } else { /* No equilibration. */ + for (i = colptr[0]; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) { /* New diagonal */ + if ( job == 2 || job == 3 ) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if ( job == 4 ) + dsum += fabs(a[i]); + else if ( job == 5 ) + dprod *= fabs(a[i]); + } +#endif + } /* end for i ... */ + } /* end for j ... */ + } /* end else */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( PRNTlevel>=2 ) + if ( job == 2 || job == 3 ) { + if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); + } else if ( job == 4 ) { + if ( !iam ) printf("\tsum of diagonal %e\n", dsum); + } else if ( job == 5 ) { + if ( !iam ) printf("\t product of diagonal %e\n", dprod); + } +#endif + + } /* else !factored */ + + t = SuperLU_timer_() - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); +#endif + + } else { /* options->RowPerm == NOROWPERM */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if ( notran ) *(unsigned char *)norm = '1'; + else *(unsigned char *)norm = 'I'; + anorm = slangs_dist(norm, A); +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. anorm %e\n", anorm); +#endif + } + + /* ------------------------------------------------------------ + Perform the LU factorization. 
+ ------------------------------------------------------------*/ + if ( !factored ) { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ + get_perm_c_dist(iam, permc_spec, A, perm_c); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + sp_colorder(options, A, perm_c, etree, &AC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ + ACstore = AC.Store; + for (j = 0; j < n; ++j) + for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { + irow = ACstore->rowind[i]; + ACstore->rowind[i] = perm_c[irow]; + } + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Perform a symbolic factorization on matrix A and set up the + nonzero data structures which are suitable for supernodal GENP. */ + if ( Fact != SamePattern_SameRowPerm ) { +#if ( PRNTlevel>=1 ) + if ( !iam ) + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_(); + if ( !(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) + ABORT("Malloc fails for Glu_freeable."); + + iinfo = symbfact(options, iam, &AC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + + if ( iinfo <= 0 ) { + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); + printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); + printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); + printf("\tint %d, short %d, float %d, double %d\n", + (int) sizeof(int_t), (int) sizeof(short), + (int) sizeof(float), (int) sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu*1e-6, + symb_mem_usage.total*1e-6, + symb_mem_usage.expansions); + } +#endif + } else { /* symbfact out of memory */ +#if ( PRNTlevel>=1 ) + if ( !iam ) + fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo); +#endif + *info = iinfo; + return; + } + } + + /* Distribute the L and U factors onto the process grid. */ + t = SuperLU_timer_(); + dist_mem_use = sdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factor. */ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + /* Perform numerical factorization in parallel. 
*/ + t = SuperLU_timer_(); + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + +#if ( PRNTlevel>=1 ) + { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + temp = SUPERLU_MAX(symb_mem_usage.total, + symb_mem_usage.for_lu + + (float)dist_mem_use + num_mem_usage.for_lu); + temp = SUPERLU_MAX(temp, num_mem_usage.total); + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm ); + stat->TinyPivots = TinyPivots; + if ( !iam ) { + printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", + for_lu*1e-6, total*1e-6); + printf("\tAll space (MB):" + "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", + avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); + printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); + printf(".. psgstrf INFO = %d\n", *info); + } + } +#endif + + } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ + /* Permute columns of A to form A*Pc' using the existing perm_c. + * NOTE: rows of A were previously permuted to Pc*A. + */ + sp_colorder(options, A, perm_c, NULL, &AC); + } /* if !factored ... */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. 
+ ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pr*B. + ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + } + + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pc*B. + ------------------------------------------------------------*/ + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + psgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. 
*/ + t = SuperLU_timer_(); + psgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, + X, ldx, nrhs, berr, stat, info); + stat->utime[REFINE] = SuperLU_timer_() - t; + } + + /* Permute the solution matrix X <= Pc'*X. */ + for (j = 0; j < nrhs; j++) { + b_col = &B[j*ldb]; + x_col = &X[j*ldx]; + for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; + } + + /* Transform the solution matrix X to a solution of the original system + before the equilibration. */ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it is not used. */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + if ( !factored || (factored && options->IterRefine) ) + Destroy_CompCol_Permuted_dist(&AC); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx_ABglobal()"); +#endif +} + diff --git a/SRC/psgstrf.c b/SRC/psgstrf.c new file mode 100644 index 00000000..092db241 --- /dev/null +++ b/SRC/psgstrf.c @@ -0,0 +1,2004 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in parallel. + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 1, 1999
+ *   February 7, 2001  use MPI_Isend/MPI_Irecv
+ *   October 15, 2008  latency-reducing panel factorization
+ *   July    12, 2011  static scheduling and arbitrary look-ahead
+ *   March   13, 2013  change NTAGS to MPI_TAG_UB value
+ *   September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
+ *   December 31, 2015 rename xMACH to xMACH_DIST.
+ *   September 30, 2017 optimization for Intel Knights Landing (KNL) node.
+ *   June 1, 2018      add parallel AWPM pivoting; add back arrive_at_ublock()
+ *   February 8, 2019  version 6.1.1
+ *
+ * Sketch of the algorithm
+ *
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = Aik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( psgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ *
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           psgstrf2(k), one column at a time
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j)
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i)
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +#ifdef GPU_ACC +// #define NUM_GPU_STREAMS 16 +// #define NUM_GPU_STREAMS 16 +#include "gpublas_utils.h" +#endif + +/* Various defininations */ +/* + Name : SUPERNODE_PROFILE + Purpose : For SuperNode Level profiling of various measurements such as gigaflop/sec + obtained,bandwidth achieved: + Overhead : Low +*/ +// #define SUPERNODE_PROFILE + +/* + Name : BASELINE + Purpose : baseline to compare performance against + Overhead : NA : this won't be used for running experiments +*/ +// #define BASELINE + +/* + Name : PHI_FRAMEWORK + Purpose : To simulate and test algorithm used for offloading Phi + Overhead : NA : this won't be used for running experiments +*/ +#define PHI_FRAMEWORK + +#if 0 +#define CACHELINE 64 /* bytes, Xeon Phi KNL */ +#else +#define CACHELINE 0 /* not worry about false sharing of different threads */ +#endif +//#define GEMM_PADLEN 1 +#define GEMM_PADLEN 8 + +#define PSGSTRF2 psgstrf2_trsm + +#ifdef ISORT +extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); +extern void isort1 (int_t N, int_t * ARRAY); + +#else + +int +superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + + +/************************************************************************/ + +#include "sscatter.c" + +/************************************************************************/ + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t +psgstrf(superlu_dist_options_t * options, int m, int n, float anorm, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs = _cptofcd ("N", strlen ("N")); + _fcd ftcs1 = _cptofcd ("L", strlen ("L")); + _fcd ftcs2 = _cptofcd ("N", strlen ("N")); + _fcd ftcs3 = _cptofcd ("U", strlen ("U")); +#endif + float zero = 0.0, alpha = 1.0, beta = 0.0; + int_t *xsup; + int_t *lsub, *lsub1, *usub, *Usub_buf; + int_t **Lsub_buf_2, **Usub_buf_2; + float **Lval_buf_2, **Uval_buf_2; /* pointers to starts of bufs */ + float *lusup, *lusup1, *uval, *Uval_buf; /* pointer to current buf */ + int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, + lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, + nlb, nub, nsupc, rel, rukp, il, iu; + int_t Pc, Pr; + int iam, kcol, krow, yourcol, mycol, myrow, pi, pj; + int j, k, lk, nsupers; /* k - current panel to work on */ + int k0; /* counter of the next supernode to be factored */ + int kk, kk0, kk1, kk2, jj0; /* panels in the look-ahead window */ + int iukp0, rukp0, flag0, flag1; + int nsupr, nbrow, segsize; + int msg0, msg2; + int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; + float **Unzval_br_ptr, **Lnzval_bc_ptr; + int_t *index; + float *nzval; + float *ucol; + int *indirect, *indirect2; + int_t *tempi; + float *tempu, *tempv, *tempr; + /* float *tempv2d, *tempU2d; Sherry */ + int iinfo; + int *ToRecv, *ToSendD, **ToSendR; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + superlu_scope_t *scp; + float s_eps; + double thresh; + /*int full;*/ + int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; +#ifdef ISORT + int_t *iperm_u; +#endif + int *msgcnt; /* Count the size of the message xfer'd in each buffer: + 
* 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int **msgcnts, **msgcntsU; /* counts in the look-ahead window */ + int *factored; /* factored[j] == 0 : L col panel j is factorized. */ + int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */ + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + int num_look_aheads, look_id; + int *look_ahead; /* global look_ahead table */ + int_t *perm_c_supno, *iperm_c_supno; + /* perm_c_supno[k] = j means at the k-th step of elimination, + * the j-th supernode is chosen. */ + MPI_Request *recv_req, **recv_reqs, **send_reqs, **send_reqs_u, + **recv_reqs_u; + MPI_Request *send_req, *U_diag_blk_send_req = NULL; + MPI_Status status; + void *attr_val; + int flag; + + /* The following variables are used to pad GEMM dimensions so that + each is a multiple of vector length (8 doubles for KNL) */ + int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN, + gemm_n_pad = GEMM_PADLEN; + int gemm_padding = 0; + + int iword = sizeof (int_t); + int dword = sizeof (float); + + /* For measuring load imbalence in omp threads */ + double omp_load_imblc = 0.0; + double *omp_loop_time; + + double schur_flop_timer = 0.0; + double pdgstrf2_timer = 0.0; + double pdgstrs2_timer = 0.0; + double lookaheadupdatetimer = 0.0; + double InitTimer = 0.0; /* including compute schedule, malloc */ + double tt_start, tt_end; + +/* #if !defined( GPU_ACC ) */ + /* Counters for memory operations and timings */ + double scatter_mem_op_counter = 0.0; + double scatter_mem_op_timer = 0.0; + double scatterL_mem_op_counter = 0.0; + double scatterL_mem_op_timer = 0.0; + double scatterU_mem_op_counter = 0.0; + double scatterU_mem_op_timer = 0.0; + + /* Counters for flops/gather/scatter and timings */ + double GatherLTimer = 0.0; + double LookAheadRowSepMOP = 0.0; + double GatherUTimer = 0.0; + double 
GatherMOP = 0.0; + double LookAheadGEMMTimer = 0.0; + double LookAheadGEMMFlOp = 0.0; + double LookAheadScatterTimer = 0.0; + double LookAheadScatterMOP = 0.0; + double RemainGEMMTimer = 0.0; + double RemainGEMM_flops = 0.0; + double RemainScatterTimer = 0.0; + double NetSchurUpTimer = 0.0; + double schur_flop_counter = 0.0; +/* #endif */ + +#if ( PRNTlevel>= 1) + /* count GEMM max dimensions */ + int gemm_max_m = 0, gemm_max_n = 0, gemm_max_k = 0; +#endif + +#if ( DEBUGlevel>=2 ) + int_t num_copy = 0, num_update = 0; +#endif +#if ( PRNTlevel==3 ) + int zero_msg = 0, total_msg = 0; +#endif +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + double comm_wait_time = 0.0; + /* Record GEMM dimensions and times */ + FILE *fopen(), *fgemm; + int gemm_count = 0; + typedef struct { + int m, n, k; + double microseconds; + } gemm_profile; + gemm_profile *gemm_stats; +#endif + + /* Test the input parameters. */ + *info = 0; + if (m < 0) + *info = -2; + else if (n < 0) + *info = -3; + if (*info) { + pxerr_dist ("psgstrf", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + double tt1 = SuperLU_timer_ (); + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + s_eps = smach_dist("Epsilon"); + thresh = s_eps * anorm; + + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) { + fprintf (stderr, "Could not get TAG_UB\n"); + return (-1); + } + int tag_ub = *(int *) attr_val; + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + +#if ( DEBUGlevel>=1 ) + if (s_eps == 0.0) + printf (" ***** warning s_eps = %e *****\n", s_eps); + CHECK_MALLOC (iam, "Enter psgstrf()"); +#endif +#if (PROFlevel >= 1 ) + gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile)); + if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w"); + int *prof_sendR = intCalloc_dist(nsupers); +#endif + + stat->ops[FACT] = 0.0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + + /* make sure the range of look-ahead window [0, MAX_LOOKAHEADS-1] */ + num_look_aheads = SUPERLU_MAX(0, SUPERLU_MIN(options->num_lookaheads, MAX_LOOKAHEADS - 1)); + + if (Pr * Pc > 1) { + if (!(U_diag_blk_send_req = + (MPI_Request *) SUPERLU_MALLOC (Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for U_diag_blk_send_req[]."); + /* flag no outstanding Isend */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; /* used 0 before */ + + /* allocating buffers for look-ahead */ + i = Llu->bufmax[0]; + if (i != 0) { + if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) ) + ABORT ("Malloc fails for Lsub_buf."); + tempi = Llu->Lsub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i; + } + i = Llu->bufmax[1]; + if (i != 0) { + if (!(Llu->Lval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * ((size_t) i)))) + ABORT 
("Malloc fails for Lval_buf[]."); + tempr = Llu->Lval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i; + } + i = Llu->bufmax[2]; + if (i != 0) { + if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Usub_buf_2[]."); + tempi = Llu->Usub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i; + } + i = Llu->bufmax[3]; + if (i != 0) { + if (!(Llu->Uval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Uval_buf_2[]."); + tempr = Llu->Uval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i; + } + } + + log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) + * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) + * dword, stat ); + + /* creating pointers to the look-ahead buffers */ + if (! (Lsub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Lsub_buf_2[]."); + if (! (Lval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for Lval_buf_2[]."); + if (! (Usub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Uval_buf_2[]."); + if (! 
(Uval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for buf_2[]."); + for (i = 0; i <= num_look_aheads; i++) { + Lval_buf_2[i] = Llu->Lval_buf_2[i]; + Lsub_buf_2[i] = Llu->Lsub_buf_2[i]; + Uval_buf_2[i] = Llu->Uval_buf_2[i]; + Usub_buf_2[i] = Llu->Usub_buf_2[i]; + } + + if (!(msgcnts = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcntsU[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(msgcnts[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcntsU[]."); + } + + if (! (recv_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs_u[]."); + if (! (send_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! (send_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! 
(recv_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(recv_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req_u[i]."); + if (!(send_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_req_u[i]."); + if (!(send_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pc * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_reqs[i]."); + if (!(recv_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (4 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req[]."); + send_reqs[i][0] = send_reqs[i][1] = MPI_REQUEST_NULL; + recv_reqs[i][0] = recv_reqs[i][1] = MPI_REQUEST_NULL; + } + + if (!(factored = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factored[]."); + if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factoredU[]."); + for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1; + + log_memory(2 * nsupers * iword, stat); + + int num_threads = 1; +#ifdef _OPENMP +#pragma omp parallel default(shared) + #pragma omp master + { + num_threads = omp_get_num_threads (); + } +#endif + +#if 0 + omp_loop_time = (double *) _mm_malloc (sizeof (double) * num_threads,64); +#else + omp_loop_time = (double *) SUPERLU_MALLOC(num_threads * sizeof(double)); +#endif + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf(".. 
Starting with %d OpenMP threads \n", num_threads ); + fflush(stdout); + } +#endif + + nblocks = 0; + ncb = nsupers / Pc; /* number of column blocks, horizontal */ + nrb = nsupers / Pr; /* number of row blocks, vertical */ + + /* in order to have dynamic scheduling */ + int *full_u_cols; + int *blk_ldu; +#if 0 + full_u_cols = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); + blk_ldu = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); +#else + full_u_cols = SUPERLU_MALLOC((ncb+1) * sizeof(int)); + blk_ldu = SUPERLU_MALLOC((ncb+1) * sizeof(int)); // +1 to accommodate un-even division +#endif + + log_memory(2 * ncb * iword, stat); + +#if 0 /* Sherry: not used? */ + /* This bunch is used for static scheduling */ + pair *full_col_count = (pair *) _mm_malloc (sizeof (pair) * ncb,64); + int_t *count_cols, *sum_cols, *partition; + count_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + sum_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + partition = (int_t *) _mm_malloc (sizeof (int_t) * num_threads * ncb,64); + int_t ldp = ncb; +#endif + + /* ################################################################## + * Compute a good static schedule based on the factorization task graph. + * ################################################################## */ + perm_c_supno = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); + iperm_c_supno = perm_c_supno + nsupers; + + sstatic_schedule(options, m, n, LUstruct, grid, stat, + perm_c_supno, iperm_c_supno, info); + +#if ( DEBUGlevel >= 2 ) + PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno); + + /* Turn off static schedule */ + printf("[%d] .. 
Turn off static schedule for debugging ..\n", iam); + for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i; +#endif + /* ################################################################## */ + + /* constructing look-ahead table to indicate the last dependency */ + int *look_ahead_l; /* Sherry: add comment on look_ahead_l[] */ + stat->num_look_aheads = num_look_aheads; + + look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */ + log_memory(3 * nsupers * iword, stat); + + /* Sherry: omp parallel? + not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */ + ib = lb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[lb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; /* global block number */ + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) { /* leftover block rows */ + ib = nrb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[nrb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (options->SymPattern == NO) { + /* Sherry: omp parallel? 
+ not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < ncb; lb++) { /* go through L-factor */ + ib = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { /* leftover block columns */ + ib = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, grid->comm); + SUPERLU_FREE (look_ahead_l); + +#ifdef ISORT + iperm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + perm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); +#else + perm_u = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); +#endif + log_memory(nsupers * iword, stat); + + k = sp_ienv_dist (3); /* max supernode size */ +#if 0 + if ( !(Llu->ujrow = floatMalloc_dist(k*(k+1)/2)) ) + ABORT("Malloc fails for ujrow[]."); +#else + /* Instead of half storage, we'll do full storage */ + if (!(Llu->ujrow = floatCalloc_dist (k * k))) + ABORT ("Malloc fails for ujrow[]."); +#endif + log_memory(k * k * iword, stat); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, + thresh); + printf + (".. 
Buffer size: Lsub %ld\tLval %ld\tUsub %ld\tUval %ld\tLDA %ld\n", + (long int) Llu->bufmax[0], (long int) Llu->bufmax[1], + (long int) Llu->bufmax[2], (long int) Llu->bufmax[3], + (long int) Llu->bufmax[4]); + fflush(stdout); + } +#endif + + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + ToRecv = Llu->ToRecv; + ToSendD = Llu->ToSendD; + ToSendR = Llu->ToSendR; + + ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + k = CEILING (nsupers, Pr); /* Number of local block rows */ + + /* Following code is for finding maximum row dimension of all L panels */ + int local_max_row_size = 0; + int max_row_size; + +#if 0 +#if defined _OPENMP // Sherry: parallel reduction -- seems slower? +#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) +#endif +#endif + for (i = mycol; i < nsupers; i += Pc) { /* grab my local columns */ + //int tpc = PCOL (i, grid); + lk = LBj (i, grid); + lsub = Lrowind_bc_ptr[lk]; + if (lsub != NULL) { + if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1]; + } + + } + + /* Max row size is global reduction within a row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + /* Buffer size is max of look-ahead window */ + /* int_t buffer_size = + SUPERLU_MAX (max_row_size * num_threads * ldt, + get_max_buffer_size ()); */ + +#ifdef GPU_ACC /*-------- use GPU --------*/ + int gpublas_nb = get_gpublas_nb(); // default 64 + int nstreams = get_num_gpu_streams (); // default 8 + + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); + /* array holding last column blk for each partition, + used in SchCompUdt-cuda.c */ + #if 0 + int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64); + #else + int *stream_end_col = SUPERLU_MALLOC( nstreams * sizeof(int) ); + #endif + +#else /* not to use GPU 
*/ + + int Threads_per_process = get_thread_per_process(); + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ + + int_t max_ncols = 0; +#if 0 + /* symmetric assumption -- using L's supernode to estimate. */ + /* Note that in following expression 8 can be anything + as long as its not too big */ + int bigu_size = 8 * sp_ienv_dist (3) * (max_row_size); +#else + int_t bigu_size = estimate_bigu_size( nsupers, Ufstnz_br_ptr, Glu_persist, + grid, perm_u, &max_ncols ); +#endif + + /* +16 to avoid cache line false sharing */ + // int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt), + int_t bigv_size = SUPERLU_MAX(max_row_size * max_ncols, + (ldt*ldt + CACHELINE / dword) * num_threads); + + /* bigU and bigV are only allocated on CPU, but may be allocated as + page-locked memory accessible to GPU. */ + float* bigU; /* for storing entire U(k,:) panel, prepare for GEMM. */ + float* bigV; /* for storing GEMM output matrix, i.e. update matrix. + bigV is large enough to hold the aggregate GEMM output.*/ + bigU = NULL; /* allocated only on CPU */ + bigV = NULL; + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } + printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size); + fflush(stdout); +#endif + +#ifdef GPU_ACC /*-- use GPU --*/ + + if ( checkGPU(gpuHostAlloc((void**)&bigU, bigu_size * sizeof(float), gpuHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer U "); + +#if 0 // !!Sherry fix -- only dC on GPU uses buffer_size + bigv_size = buffer_size; +#endif + +#if ( PRNTlevel>=1 ) + printf("[%d].. 
BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); + fflush(stdout); +#endif + + if ( checkGPU(gpuHostAlloc((void**)&bigV, bigv_size * sizeof(float) ,gpuHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer V"); + +#if ( PRNTlevel>=1 ) + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d gpu streams \n",nstreams ); + fflush(stdout); + } +#endif + + gpublasHandle_t *handle; + handle = (gpublasHandle_t *) SUPERLU_MALLOC(sizeof(gpublasHandle_t)*nstreams); + for(int i = 0; i < nstreams; i++) handle[i] = create_handle(); + + // creating streams + gpuStream_t *streams; + streams = (gpuStream_t *) SUPERLU_MALLOC(sizeof(gpuStream_t)*nstreams); + for (int i = 0; i < nstreams; ++i) + checkGPU( gpuStreamCreate(&streams[i]) ); + + // allocating data in device + float *dA, *dB, *dC; + gpuError_t gpuStat; +#if 0 + // gpuStat = gpuMalloc( (void**)&dA, m*k*sizeof(double)); + // HOw much should be the size of dA? + // for time being just making it + // gpuStat = gpuMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double)); +#endif + + gpuStat = gpuMalloc( (void**)&dA, max_row_size*sp_ienv_dist(3)* sizeof(float)); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! Error in allocating A in the device %ld \n",m*k*sizeof(float) ); + return 1; + } + + // size of B should be bigu_size + gpuStat = gpuMalloc((void**)&dB, bigu_size * sizeof(float)); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(float)); + return 1; + } + + gpuStat = gpuMalloc((void**)&dC, buffer_size * sizeof(float) ); + if (gpuStat!= gpuSuccess) { + fprintf(stderr, "!!!! 
Error in allocating C in the device \n" ); + return 1; + } + + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC + +#else /*-------- not to use GPU --------*/ + + // for GEMM padding 0 + j = bigu_size / ldt; + bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad)); + bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); + +#if ( PRNTlevel>=1 ) + printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size); + fflush(stdout); +#endif + +//#ifdef __INTEL_COMPILER +// bigU = _mm_malloc(bigu_size * sizeof(float), 1<<12); // align at 4K page +// bigV = _mm_malloc(bigv_size * sizeof(float), 1<<12); +//#else + if ( !(bigU = floatMalloc_dist(bigu_size)) ) + ABORT ("Malloc fails for sgemm U buffer"); + if ( !(bigV = floatMalloc_dist(bigv_size)) ) + ABORT ("Malloc failed for sgemm V buffer"); +//#endif + +#endif +/*************** end ifdef GPU_ACC ****************/ + + log_memory((bigv_size + bigu_size) * dword, stat); + + // mlock(bigU,(bigu_size) * sizeof (double)); + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf (" Max row size is %d \n", max_row_size); + printf (" Threads per process %d \n", num_threads); + fflush(stdout); + } + +#endif + + /* Sherry: (ldt + 16), avoid cache line false sharing. 
+ KNL cacheline size = 64 bytes = 16 int */ + iinfo = ldt + CACHELINE / sizeof(int); + if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + + log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword, stat); + + int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib, + *RemainStRow,*Remain_lptr,*Remain_ib; + + lookAheadFullRow = intMalloc_dist( (num_look_aheads+1) ); + lookAheadStRow = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_lptr = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_ib = intMalloc_dist( (num_look_aheads+1) ); + + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + + RemainStRow = intMalloc_dist(mrb); +#if 0 + Remain_lptr = (int *) _mm_malloc(sizeof(int)*mrb,1); +#else + Remain_lptr = intMalloc_dist(mrb); +#endif + // mlock(Remain_lptr, sizeof(int)*mrb ); + Remain_ib = intMalloc_dist(mrb); + + Remain_info_t *Remain_info; +#if 0 + Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64); +#else + Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t)); +#endif + + float *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */ + Ublock_info_t *Ublock_info; + ldt = sp_ienv_dist (3); /* max supernode size */ + /* The following is quite loose */ + lookAhead_L_buff = floatMalloc_dist(ldt*ldt* (num_look_aheads+1) ); + +#if 0 + Remain_L_buff = (float *) _mm_malloc( sizeof(float)*(Llu->bufmax[1]),64); + Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); + /*int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); */ +#else + j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); + Remain_L_buff = floatMalloc_dist(Llu->bufmax[1] + 
j); /* This is loose */ + Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); + /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ +#endif + + long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads+1) * dword + + Llu->bufmax[1] * dword ; + log_memory(alloc_mem, stat); + + InitTimer = SuperLU_timer_() - tt1; + + double pxgstrfTimer = SuperLU_timer_(); + + /* ################################################################## + ** Handle first block column separately to start the pipeline. ** + ################################################################## */ + look_id = 0; + msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */ + send_req = send_reqs[0]; + recv_req = recv_reqs[0]; + + k0 = 0; + k = perm_c_supno[0]; + kcol = PCOL (k, grid); + krow = PROW (k, grid); + if (mycol == kcol) { + double ttt1 = SuperLU_timer_(); + + /* panel factorization */ + PSGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu, + U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_()-ttt1; + + scp = &grid->rscp; /* The scope of process row. */ + + /* Multicasts numeric values of L(:,0) to process rows. */ + lk = LBj (k, grid); /* Local block number. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if (lsub) { + /* number of entries in Lsub_buf[] to be transferred */ + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + /* number of entries in Lval_buf[] to be transferred */ + msgcnt[1] = lsub[1] * SuperSize (k); + } else { + msgcnt[0] = msgcnt[1] = 0; + } + + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, 0) /* 0 */, + scp->comm, &send_req[pj]); + MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, 0) /* 1 */, + scp->comm, &send_req[pj + Pc]); +#if ( DEBUGlevel>=2 ) + printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", + iam, 0, msgcnt[0], msgcnt[1], pj); +#endif + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; + msg_cnt += 2; + msg_vol += msgcnt[0] * iword + msgcnt[1] * dword; +#endif + } /* end if */ + } /* end for pj ... */ + } else { /* Post immediate receives. */ + if (ToRecv[k] >= 1) { /* Recv block column L(:,0). */ + scp = &grid->rscp; /* The scope of process row. */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol, + SLU_MPI_TAG (0, 0) /* 0 */ , + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_FLOAT, kcol, + SLU_MPI_TAG (1, 0) /* 1 */ , + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } /* end if mycol == 0 */ + + factored[k] = 0; /* flag column k as factored. */ + + /* post receive of first U-row */ + if (myrow != krow) { + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ + scp = &grid->cscp; /* The scope of process column. 
*/ + Usub_buf = Llu->Usub_buf_2[0]; + Uval_buf = Llu->Uval_buf_2[0]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, 0) /* 2%tag_ub */ , + scp->comm, &recv_reqs_u[0][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, 0) /* 3%tag_ub */ , + scp->comm, &recv_reqs_u[0][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + /* ################################################################## + **** MAIN LOOP **** + ################################################################## */ + for (k0 = 0; k0 < nsupers; ++k0) { + k = perm_c_supno[k0]; + + /* ============================================ * + * ======= look-ahead the new L columns ======= * + * ============================================ */ + /* tt1 = SuperLU_timer_(); */ + if (k0 == 0) { /* look-ahead all the columns in the window */ + kk1 = k0 + 1; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + } else { /* look-ahead one new column after the current window */ + kk1 = k0 + num_look_aheads; + kk2 = SUPERLU_MIN (kk1, nsupers - 1); + } + + for (kk0 = kk1; kk0 <= kk2; kk0++) { + /* loop through look-ahead window in L */ + + kk = perm_c_supno[kk0]; /* use the ordering from static schedule */ + look_id = kk0 % (1 + num_look_aheads); /* which column in window */ + + if (look_ahead[kk] < k0) { /* does not depend on current column k */ + kcol = PCOL (kk, grid); + if (mycol == kcol) { /* I own this panel */ + + /* Panel factorization -- Factor diagonal and subdiagonal + L blocks and test for exact singularity. */ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + + PSGSTRF2 (options, kk0, kk, thresh, Glu_persist, + grid, Llu, U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Multicasts numeric values of L(:,kk) to process rows. 
*/ + /* ttt1 = SuperLU_timer_(); */ + msgcnt = msgcnts[look_id]; /* point to the proper count array */ + send_req = send_reqs[look_id]; + + lk = LBj (kk, grid); /* Local block number in L. */ + lsub1 = Lrowind_bc_ptr[lk]; + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */ + msgcnt[1] = lsub1[1] * SuperSize (kk); /* Lval_buf[] size */ + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { + lusup1 = Lnzval_bc_ptr[lk]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n", + iam, kk, msgcnt[0], msgcnt[1], pj); +#endif + } + } + /* stat->time9 += SuperLU_timer_() - ttt1; */ + } else { /* Post Recv of block column L(:,kk). */ + /* double ttt1 = SuperLU_timer_(); */ + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. 
*/ + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + /* stat->time10 += SuperLU_timer_() - ttt1; */ + } /* end if mycol == Pc(kk) */ + } /* end if look-ahead in L panels */ + + /* Pre-post irecv for U-row look-ahead */ + krow = PROW (kk, grid); + if (myrow != krow) { + if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */ + scp = &grid->cscp; /* The scope of process column. */ + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + } /* end for each column in look-ahead window for L panels */ + + /* stat->time4 += SuperLU_timer_()-tt1; */ + + /* ================================= * + * ==== look-ahead the U rows === * + * ================================= */ + kk1 = k0; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = kk1; kk0 < kk2; kk0++) { + kk = perm_c_supno[kk0]; /* order determined from static schedule */ + if (factoredU[kk0] != 1 && look_ahead[kk] < k0) { + /* does not depend on current column k */ + kcol = PCOL (kk, grid); + krow = PROW (kk, grid); + lk = LBj (kk, grid); /* Local block number across row. NOT USED?? 
-- Sherry */ + + look_id = kk0 % (1 + num_look_aheads); + msgcnt = msgcntsU[look_id]; + recv_req = recv_reqs[look_id]; + + /* ================================================= * + * Check if diagonal block has been received * + * for panel factorization of U in look-ahead window * + * ================================================= */ + + if (mycol == kcol) { /* I own this column panel, no need + to receive L */ + flag0 = flag1 = 1; + msgcnt[0] = msgcnt[1] = -1; /* No need to transfer Lsub, nor Lval */ + } else { /* Check to receive L(:,kk) from the left */ + flag0 = flag1 = 0; + if ( ToRecv[kk] >= 1 ) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if ( recv_req[0] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[0], &flag0, &status); + if ( flag0 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + } else flag0 = 1; + + if ( recv_req[1] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[1], &flag1, &status); + if ( flag1 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + } else flag1 = 1; +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } else { + msgcnt[0] = 0; + } + } + + if (flag0 && flag1) { /* L(:,kk) is ready */ + /* tt1 = SuperLU_timer_(); */ + scp = &grid->cscp; /* The scope of process column. */ + if (myrow == krow) { + factoredU[kk0] = 1; + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, + Ublock_info, stat); + } + + pdgstrs2_timer += SuperLU_timer_()-ttt2; + /* stat->time8 += SuperLU_timer_()-ttt2; */ + + /* Multicasts U(kk,:) to process columns. 
*/ + lk = LBi (kk, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + if (usub) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, kk0), /* (4*kk0+2)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi]); + MPI_Isend (uval, msgcnt[3], MPI_FLOAT, + pi, SLU_MPI_TAG (3, kk0), /* (4*kk0+3)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi + Pr]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) to Pr %2d\n", + iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } /* end if myrow == krow */ + } /* end if flag0 & flag1 ... */ + } /* end if factoredU[] ... */ + } /* end for kk0 ... */ + + /* ============================================== * + * == start processing the current row of U(k,:) * + * ============================================== */ + knsupc = SuperSize (k); + krow = PROW (k, grid); + kcol = PCOL (k, grid); + + /* tt1 = SuperLU_timer_(); */ + look_id = k0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; + + if (mycol == kcol) { + lk = LBj (k, grid); /* Local block number in L */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (pj = 0; pj < Pc; ++pj) { + /* Wait for Isend to complete before using lsub/lusup buffer. 
*/ + if (ToSendR[lk][pj] != EMPTY) { + MPI_Wait (&send_req[pj], &status); + MPI_Wait (&send_req[pj + Pc], &status); + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + } else { + if (ToRecv[k] >= 1) { /* Recv block column L(:,k). */ + + scp = &grid->rscp; /* The scope of process row. */ + + /* ============================================= * + * Waiting for L(:,kk) for outer-product uptate * + * if iam in U(kk,:), then the diagonal block * + * did not reach in time for panel factorization * + * of U(k,:). * + * ============================================= */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if (recv_req[0] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } else { + msgcnt[0] = msgcntsU[look_id][0]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", + iam, k, look_id, msgcnt[0]); +#endif + } + + if (recv_req[1] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } else { + msgcnt[1] = msgcntsU[look_id][1]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", + iam, k, look_id, msgcnt[1]); +#endif + } + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif +#if ( DEBUGlevel>=2 ) + printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n", + iam, k, msgcnt[0], msgcnt[1], kcol); + fflush (stdout); +#endif + +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[0]) ++zero_msg; +#endif + } else { + msgcnt[0] = 0; + } + + lsub = Lsub_buf_2[look_id]; + lusup = Lval_buf_2[look_id]; + } /* else if mycol = Pc(k) */ + /* stat->time1 += SuperLU_timer_()-tt1; */ + + scp = &grid->cscp; /* The scope 
of process column. */ + + /* tt1 = SuperLU_timer_(); */ + if (myrow == krow) { /* I own U(k,:) */ + lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + + if (factoredU[k0] == -1) { + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (k0, k, Glu_persist, grid, Llu, + Ublock_info, stat); + } + pdgstrs2_timer += SuperLU_timer_() - ttt2; + + /* Sherry -- need to set factoredU[k0] = 1; ?? */ + + /* Multicasts U(k,:) along process columns. */ + if ( usub ) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { /* Matching recv was pre-posted before */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Send (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */ + scp->comm); + MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi, + SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ + scp->comm); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) down to Pr %2d\n", iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + } else { /* Panel U(k,:) already factorized from previous look-ahead */ + + /* ================================================ * + * Wait for downward sending of U(k,:) to complete * + * for outer-product update. 
* + * ================================================ */ + + if (ToSendD[lk] == YES) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { + MPI_Wait (&send_reqs_u[look_id][pi], &status); + MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + msgcnt[2] = msgcntsU[look_id][2]; + msgcnt[3] = msgcntsU[look_id][3]; + } + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } else { /* myrow != krow */ + + /* ========================================== * + * Wait for U(k,:) for outer-product updates. * + * ========================================== */ + + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Wait (&recv_reqs_u[look_id][0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[2]); + MPI_Wait (&recv_reqs_u[look_id][1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + usub = Usub_buf; + uval = Uval_buf; +#if ( DEBUGlevel>=2 ) + printf ("[%d] Recv U(%4d,:) from Pr %2d\n", iam, k, krow); +#endif +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[2]) ++zero_msg; +#endif + } else { + msgcnt[2] = 0; + } + /* stat->time6 += SuperLU_timer_()-tt1; */ + } /* end if myrow == Pr(k) */ + + /* + * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). + * for (j = k+1; k < N; ++k) { + * for (i = k+1; i < N; ++i) + * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) + * && L(i,k) != 0 && U(k,j) != 0 ) + * A(i,j) = A(i,j) - L(i,k) * U(k,j); + */ + msg0 = msgcnt[0]; + msg2 = msgcnt[2]; + /* tt1 = SuperLU_timer_(); */ + if (msg0 && msg2) { /* L(:,k) and U(k,:) are not empty. */ + nsupr = lsub[1]; /* LDA of lusup. */ + if (myrow == krow) { /* Skip diagonal block L(k,k). 
*/ + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + } else { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + } + iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + rukp = 0; /* Pointer to nzval[] of U(k,:) */ + nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + klst = FstBlockC (k + 1); + + /* ------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead) + ------------------------------------------------------------- */ + iukp0 = iukp; + rukp0 = rukp; + /* reorder the remaining columns in bottome-up */ + /* TAU_STATIC_TIMER_START("LOOK_AHEAD_UPDATE"); */ + for (jj = 0; jj < nub; jj++) { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + jb = usub[iukp]; /* Global block number of block U(k,j). */ + nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + /* iperm_u is sorted based on elimination order; + perm_u reorders the U blocks to match the elimination order. 
*/ + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + +/************************************************************************/ + double ttx =SuperLU_timer_(); + +//#include "slook_ahead_update_v4.c" +#include "slook_ahead_update.c" + + lookaheadupdatetimer += SuperLU_timer_() - ttx; +/************************************************************************/ + + /*ifdef OMP_LOOK_AHEAD */ + /* TAU_STATIC_TIMER_STOP("LOOK_AHEAD_UPDATE"); */ + } /* if L(:,k) and U(k,:) not empty */ + + /* stat->time3 += SuperLU_timer_()-tt1; */ + + /* ================== */ + /* == post receive == */ + /* ================== */ + kk1 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = k0 + 1; kk0 <= kk1; kk0++) { + kk = perm_c_supno[kk0]; + kcol = PCOL (kk, grid); + + if (look_ahead[kk] == k0) { + if (mycol != kcol) { + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. */ + + look_id = kk0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } else { + lk = LBj (kk, grid); /* Local block number. */ + lsub1 = Lrowind_bc_ptr[lk]; + lusup1 = Lnzval_bc_ptr[lk]; + if (factored[kk] == -1) { + /* Factor diagonal and subdiagonal blocks and + test for exact singularity. 
*/ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + PSGSTRF2 (options, kk0, kk, thresh, + Glu_persist, grid, Llu, U_diag_blk_send_req, + tag_ub, stat, info); + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Process column *kcol+1* multicasts numeric + values of L(:,k+1) to process rows. */ + look_id = kk0 % (1 + num_look_aheads); + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub1[1] * SuperSize (kk); + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif + } + } /* end for pj ... */ + } /* if factored[kk] ... */ + } + } + } + + double tsch = SuperLU_timer_(); + + /*******************************************************************/ + +#ifdef GPU_ACC /*-- GPU --*/ + +#include "sSchCompUdt-cuda.c" + +#else + +/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/ +//#include "sSchCompUdt-2Ddynamic_v6.c" + +#include "sSchCompUdt-2Ddynamic.c" + +#endif + /*uncomment following to compare against SuperLU 3.3 baseline*/ + /* #include "SchCompUdt--baseline.c" */ + /************************************************************************/ + + NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* MAIN LOOP for k0 = 0, ... */ + + /* ################################################################## + ** END MAIN LOOP: for k0 = ... 
+ ################################################################## */ + + pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer; + +#if ( PRNTlevel>=2 ) + /* Print detailed statistics */ + /* Updating total flops */ + double allflops; + MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, + 0, grid->comm); + if ( iam==0 ) { + printf("\nInitialization time\t%8.4lf seconds\n" + "\t Serial: compute static schedule, allocate storage\n", InitTimer); + printf("\n==== Time breakdown in factorization (rank 0) ====\n"); + printf("Panel factorization \t %8.4lf seconds\n", + pdgstrf2_timer + pdgstrs2_timer); + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. Time in GEMM %8.4lf \n", + LookAheadGEMMTimer + RemainGEMMTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", + RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); + printf(".. 
Time to Scatter %8.4lf \n", + LookAheadScatterTimer + RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); + + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); + printf("--------\n"); + printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); + } +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + + /******************************************************** + * Free memory * + ********************************************************/ + + if (Pr * Pc > 1) { + SUPERLU_FREE (Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ + SUPERLU_FREE (Lval_buf_2[0]); /* also free Lval_buf_2[1] */ + if (Llu->bufmax[2] != 0) + SUPERLU_FREE (Usub_buf_2[0]); + if (Llu->bufmax[3] != 0) + SUPERLU_FREE (Uval_buf_2[0]); + if (U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) { + /* wait for last Isend requests to complete, deallocate objects */ + for (krow = 0; krow < Pr; ++krow) { + if (krow != myrow) + MPI_Wait (U_diag_blk_send_req + krow, &status); + } + } + SUPERLU_FREE (U_diag_blk_send_req); + } + + log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword), + stat ); + + SUPERLU_FREE (Lsub_buf_2); + SUPERLU_FREE (Lval_buf_2); + SUPERLU_FREE (Usub_buf_2); + SUPERLU_FREE (Uval_buf_2); + SUPERLU_FREE (perm_c_supno); + SUPERLU_FREE (perm_u); +#ifdef ISORT + SUPERLU_FREE (iperm_u); +#endif + SUPERLU_FREE (look_ahead); + SUPERLU_FREE (factoredU); + SUPERLU_FREE (factored); + log_memory(-(6 * nsupers * iword), stat); + + for (i = 0; i <= num_look_aheads; i++) { + SUPERLU_FREE (msgcnts[i]); + SUPERLU_FREE (msgcntsU[i]); + 
} + SUPERLU_FREE (msgcnts); + SUPERLU_FREE (msgcntsU); + + for (i = 0; i <= num_look_aheads; i++) { + SUPERLU_FREE (send_reqs_u[i]); + SUPERLU_FREE (recv_reqs_u[i]); + SUPERLU_FREE (send_reqs[i]); + SUPERLU_FREE (recv_reqs[i]); + } + + SUPERLU_FREE (recv_reqs_u); + SUPERLU_FREE (send_reqs_u); + SUPERLU_FREE (recv_reqs); + SUPERLU_FREE (send_reqs); + +#ifdef GPU_ACC + checkGPU (gpuFreeHost (bigV)); + checkGPU (gpuFreeHost (bigU)); + gpuFree( (void*)dA ); /* Sherry added */ + gpuFree( (void*)dB ); + gpuFree( (void*)dC ); + SUPERLU_FREE( handle ); + SUPERLU_FREE( streams ); + SUPERLU_FREE( stream_end_col ); +#else +// #ifdef __INTEL_COMPILER +// _mm_free (bigU); +// _mm_free (bigV); +// #else + SUPERLU_FREE (bigV); + SUPERLU_FREE (bigU); +// #endif + /* Decrement freed memory from memory stat. */ + log_memory(-(bigv_size + bigu_size) * dword, stat); +#endif + + SUPERLU_FREE (Llu->ujrow); + // SUPERLU_FREE (tempv2d);/* Sherry */ + SUPERLU_FREE (indirect); + SUPERLU_FREE (indirect2); /* Sherry added */ + + ldt = sp_ienv_dist(3); + log_memory( -(3 * ldt *ldt * dword + 2 * ldt * num_threads * iword), stat ); + + /* Sherry added */ + SUPERLU_FREE(omp_loop_time); + SUPERLU_FREE(full_u_cols); + SUPERLU_FREE(blk_ldu); +#if ( PRNTlevel>=1 ) + log_memory(-2 * ncb * dword, stat); +#endif + + SUPERLU_FREE(lookAheadFullRow); + SUPERLU_FREE(lookAheadStRow); + SUPERLU_FREE(lookAhead_lptr); + SUPERLU_FREE(lookAhead_ib); + + SUPERLU_FREE(RemainStRow); + SUPERLU_FREE(Remain_lptr); + SUPERLU_FREE(Remain_ib); + SUPERLU_FREE(Remain_info); + SUPERLU_FREE(lookAhead_L_buff); + SUPERLU_FREE(Remain_L_buff); + log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads + 1) * dword + + Llu->bufmax[1] * dword), stat ); + + SUPERLU_FREE(Ublock_info); + /*SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ + + +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + /* Prepare error message - find the smallesr index i that 
U(i,i)==0 */ + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if ( iam==0 ) { + printf ("\tPSGSTRF comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + printf("\t\tcomm time on task 0: %8.2lf\n" + "\t\t\tcomm down DIAG block %8.2lf\n" + "\t\t\tcomm right L panel %8.2lf\n" + "\t\t\tcomm down U panel %8.2lf\n", + stat->utime[COMM], stat->utime[COMM_DIAG], + stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]); + //#include + //int Digs = DECIMAL_DIG; + printf("gemm_count %d\n", gemm_count); + for (i = 0; i < gemm_count; ++i) + fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n, + gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]); + + fclose(fgemm); + } + SUPERLU_FREE(gemm_stats); + SUPERLU_FREE(prof_sendR); + } +#endif + +#if ( PRNTlevel==3 ) + MPI_Allreduce (&zero_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. # msg of zero size\t%d\n", iinfo); + MPI_Allreduce (&total_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. 
# total msg\t%d\n", iinfo); +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks (iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks (iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + +#if ( DEBUGlevel>=3 ) + printf ("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgstrf()"); +#endif + + return 0; +} /* PSGSTRF */ + diff --git a/SRC/psgstrf2.c b/SRC/psgstrf2.c new file mode 100644 index 00000000..b07701b2 --- /dev/null +++ b/SRC/psgstrf2.c @@ -0,0 +1,947 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs panel LU factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified:
+ *   September 30, 2017
+ *   May 10, 2019 version 7.0.0
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +//#include "cblas.h" + +/***************************************************************************** + * The following psgstrf2_trsm is in version 6 and earlier. + *****************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +/* This pdgstrf2 is based on TRSM function */ +void +psgstrf2_trsm + (superlu_dist_options_t * options, int_t k0, int_t k, double thresh, + Glu_persist_t * Glu_persist, gridinfo_t * grid, sLocalLU_t * Llu, + MPI_Request * U_diag_blk_send_req, int tag_ub, + SuperLUStat_t * stat, int *info) +{ + /* printf("entering psgstrf2 %d \n", grid->iam); */ + int cols_left, iam, l, pkk, pr; + int incx = 1, incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ + int luptr; + int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t *xsup = Glu_persist->xsup; + float *lusup, temp; + float *ujrow, *ublk_ptr; /* pointer to the U block */ + float alpha = -1, zero = 0.0; + int_t Pr; + MPI_Status status; + MPI_Comm comm = (grid->cscp).comm; + double t1, t2; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; +#ifdef PI_DEBUG + printf ("rank %d Iter %d k=%d \t strsm nsuper %d \n", + iam, k0, k, nsupr); +#endif + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. 
*/ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if ( U_diag_blk_send_req && + U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) { + /* There are pending sends - wait for all Isend to complete */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != myrow) { + MPI_Wait (U_diag_blk_send_req + pr, &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + /* flag no more outstanding send request. */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; + } + + if (iam == pkk) { /* diagonal process */ + /* ++++ First step compute diagonal block ++++++++++ */ + for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ + /* Diagonal pivot */ + i = luptr; + /* May replace zero pivot. */ + if (options->ReplaceTinyPivot == YES ) { + if (fabs (lusup[i]) < thresh) { /* Diagonal */ + +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif + /* Keep the new diagonal entry with the same sign. */ + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif + ++(stat->TinyPivots); + } + } + +#if 0 + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ +#endif + + /* storing U in full form */ + int st; + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { + st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + } + + if ( ujrow[0] == zero ) { /* Test for singularity. */ + *info = j + jfst + 1; + } else { /* Scale the j-th column within diag. block. */ + temp = 1.0 / ujrow[0]; + for (i = luptr + 1; i < luptr - j + nsupc; ++i) + lusup[i] *= temp; + stat->ops[FACT] += nsupc - j - 1; + } + + /* Rank-1 update of the trailing submatrix within diag. block. 
*/ + if (--cols_left) { + /* l = nsupr - j - 1; */ + l = nsupc - j - 1; /* Piyush */ + sger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx, + &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], + &nsupr); + stat->ops[FACT] += 2 * l * cols_left; + } + + /* ujrow = ublk_ptr + u_diag_cnt; */ + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... first loop */ + + /* ++++ Second step compute off-diagonal block with communication ++*/ + + ublk_ptr = ujrow = Llu->ujrow; + + if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */ + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != krow) { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_FLOAT, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + + /* flag outstanding Isend */ + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */ + } + + /* pragma below would be changed by an MKL call */ + + l = nsupr - nsupc; + // n = nsupc; + float alpha = 1.0; +#ifdef PI_DEBUG + printf ("calling strsm\n"); + printf ("strsm diagonal param 11: %d \n", nsupr); +#endif + +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr, + 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l; + } else { /* non-diagonal process */ + /* ================================================================== * + * Receive the diagonal block of U for panel factorization of L(:,k). 
* + * Note: we block for panel factorization of L(:,k), but panel * + * factorization of U(:,k) do not block * + * ================================================================== */ + + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0)); +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_FLOAT, krow, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, &status); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + if (nsupr > 0) { + float alpha = 1.0; + +#ifdef PI_DEBUG + printf ("strsm non diagonal param 11: %d \n", nsupr); + if (!lusup) + printf (" Rank :%d \t Empty block column occurred :\n", iam); +#endif +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr; + } + + } /* end if pkk ... */ + + /* printf("exiting psgstrf2 %d \n", grid->iam); */ + +} /* PSGSTRF2_trsm */ + + + +/***************************************************************************** + * The following functions are for the new pdgstrf2_strsm in the 3D code. + *****************************************************************************/ +static +int_t LpanelUpdate(int off0, int nsupc, float* ublk_ptr, int ld_ujrow, + float* lusup, int nsupr, SCT_t* SCT) +{ + int_t l = nsupr - off0; + float alpha = 1.0; + double t1 = SuperLU_timer_(); + +#define GT 32 +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < CEILING(l, GT); ++i) + { + int_t off = i * GT; + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_strsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + + } /* for i = ... 
*/ + + t1 = SuperLU_timer_() - t1; + + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; + SCT->trf2_time += t1; + SCT->L_PanelUpdate_tl += t1; + return 0; +} + +#pragma GCC push_options +#pragma GCC optimize ("O0") +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
+void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
+                   float *BlockUFactor, /*factored U is overwritten here*/
+                   Glu_persist_t *Glu_persist, gridinfo_t *grid, sLocalLU_t *Llu,
+                   SuperLUStat_t *stat, int *info, SCT_t* SCT)
+{
+    //double t1 = SuperLU_timer_();
+    int_t *xsup = Glu_persist->xsup;
+    float alpha = -1, zero = 0.0;  /* alpha = -1 for the rank-1 update; zero for the singularity test */
+
+    // printf("Entering Local_Sgstrf2 %d \n", k);
+    /* Initialization. */
+    int_t lk = LBj (k, grid);          /* Local block number */
+    int_t jfst = FstBlockC (k);
+    int_t jlst = FstBlockC (k + 1);
+    float *lusup = Llu->Lnzval_bc_ptr[lk];
+    int nsupc = SuperSize (k);
+    int nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+    float *ublk_ptr = BlockUFactor;
+    float *ujrow = BlockUFactor;
+    int_t luptr = 0;                  /* Point to the diagonal entry of column j in lusup[]. */
+    int cols_left = nsupc;          /* columns of the supernode remaining to factor */
+    int_t u_diag_cnt = 0;
+    int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
+    int incx = 1;
+    int incy = ld_ujrow;
+
+    for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
+    {
+        /* Diagonal pivot */
+        int_t i = luptr;
+        /* Optionally replace a tiny pivot (|pivot| < thresh).  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
+        {
+            if (fabs (lusup[i]) < thresh) {  /* Diagonal */
+
+#if ( PRNTlevel>=2 )
+                    printf ("(%d) .. col %d, tiny pivot %e  ",
+                            iam, jfst + j, lusup[i]);
+#endif /* NOTE(review): 'iam' is not declared in this function; this PRNTlevel>=2 branch may not compile -- confirm */
+                /* Keep the new diagonal entry with the same sign. */
+                if (lusup[i] < 0) lusup[i] = -thresh;
+                else lusup[i] = thresh;
+#if ( PRNTlevel>=2 )
+                    printf ("replaced by %e\n", lusup[i]);
+#endif
+                ++(stat->TinyPivots);
+            }
+        }
+
+        for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+        {
+            int_t st = j * ld_ujrow + j;
+            ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
+        }
+
+        if (ujrow[0] == zero)   /* Test for singularity. */
+        {
+            *info = j + jfst + 1;
+        }
+        else                /* Scale the j-th column. */
+        {
+            float temp;
+            temp = 1.0 / ujrow[0];
+            for (int_t i = luptr + 1; i < luptr - j + nsupc; ++i)  /* note: this 'i' shadows the outer 'i' */
+                lusup[i] *= temp;
+            stat->ops[FACT] += nsupc - j - 1;
+        }
+
+        /* Rank-1 update of the trailing submatrix. */
+        if (--cols_left)
+        {
+            /*following must be int*/
+            int l = nsupc - j - 1;
+
+	    /* Rank-1 update: A := alpha * x * y^T + A on the trailing block */
+            superlu_sger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
+            stat->ops[FACT] += 2 * l * cols_left;
+        }
+
+        ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
+        luptr += nsupr + 1;           /* move to next column */
+
+    }                       /* for column j ...  first loop */
+
+
+    //int_t thread_id = omp_get_thread_num();
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Sgstrf2 */
+
+#pragma GCC pop_options
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * nsupers (input) int_t (global)
+ *         Number of supernodes.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ * SCT    (output) SCT_t*
+ *        Additional statistics used in the 3D algorithm.
+ *
+ * 
+ */
+void psgstrf2_xtrsm
+(superlu_dist_options_t *options, int_t nsupers,
+ int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist,
+ gridinfo_t *grid, sLocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
+ int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT)
+{
+    /* Panel factorization of block column k.  The diagonal process factors
+     * the diagonal block, Isends it down its process column, and updates its
+     * own L panel; the other column processes receive the diagonal block and
+     * update their L panels with it. */
+    int cols_left, iam, pkk;
+    int incy = 1;
+
+    int nsupr;                /* number of rows in the block (LDA) */
+    int luptr;
+    int_t myrow, krow, j, jfst, jlst, u_diag_cnt;
+    int_t nsupc;              /* number of columns in the block */
+    int_t *xsup = Glu_persist->xsup;  /* also read implicitly by the
+                                         SuperSize/FstBlockC macros below */
+    float *lusup;
+    float *ujrow, *ublk_ptr; /* pointer to the U block */
+    int_t Pr;
+
+    /* Quick return. */
+    *info = 0;
+
+    /* Initialization. */
+    iam = grid->iam;
+    Pr = grid->nprow;
+    myrow = MYROW (iam, grid);
+    krow = PROW (k, grid);                              /* row owning block k */
+    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);  /* diagonal process of k */
+    j = LBj (k, grid);       /* Local block number */
+    jfst = FstBlockC (k);
+    jlst = FstBlockC (k + 1);
+    lusup = Llu->Lnzval_bc_ptr[j];
+    nsupc = SuperSize (k);
+    if (Llu->Lrowind_bc_ptr[j])
+        nsupr = Llu->Lrowind_bc_ptr[j][1];
+    else
+        nsupr = 0;
+    ublk_ptr = ujrow = Llu->ujrow;
+
+    luptr = 0;               /* Point to the diagonal entries. */
+    cols_left = nsupc;       /* supernode size */
+    int ld_ujrow = nsupc;    /* leading dimension of ujrow */
+    u_diag_cnt = 0;
+    incy = ld_ujrow;
+
+    if (U_diag_blk_send_req && U_diag_blk_send_req[myrow])
+    {
+        /* There are pending sends - wait for all Isend to complete */
+        Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT);
+    }
+
+    if (iam == pkk)          /* diagonal process */
+    {
+        /*factorize the diagonal block*/
+        Local_Sgstrf2(options, k, thresh, Llu->ujrow, Glu_persist,
+                      grid, Llu, stat, info, SCT);
+        ublk_ptr = ujrow = Llu->ujrow;
+
+        if (U_diag_blk_send_req && iam == pkk)   /* Send the U block */
+        {
+            sISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req,
+                              grid, tag_ub);
+            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */
+        }
+
+        /* Triangular solve on the L panel below the diagonal block. */
+        LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+    }
+    else                     /* non-diagonal process */
+    {
+        /* ================================================ *
+         * Receive the diagonal block of U                  *
+         * for panel factorization of L(:,k)                *
+         * note: we block for panel factorization of L(:,k) *
+         * but panel factorization of U(:,k) don't          *
+         * ================================================ */
+
+        sRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub);
+
+        if (nsupr > 0)
+        {
+            LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+        }
+    } /* end if pkk ... */
+
+} /* psgstrf2_xtrsm */
+
+/*****************************************************************************
+ * The following functions are for the new psgstrs2_omp in the 3D code.
+ *****************************************************************************/ + +/* PSGSTRS2 helping kernels*/ + +int_t sTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + float* uval, float *tempv) +{ + double zero = 0.0; + int_t ncols = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + int_t segsize = klst - usub[jj]; + if ( segsize ) + { + int_t lead_zero = ldu - segsize; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; + tempv += lead_zero; + for (int_t i = 0; i < segsize; ++i) + tempv[i] = uval[rukp + i]; + rukp += segsize; + tempv += segsize; + ncols++; + } + } + return ncols; +} + +int_t sTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, float* uval, float *tempv) +{ + for (int_t jj = 0; jj < nsupc; ++jj) + { + int_t segsize = klst - usub[iukp + jj]; + if (segsize) + { + int_t lead_zero = ldu - segsize; + tempv += lead_zero; + for (int i = 0; i < segsize; ++i) + { + uval[rukp + i] = tempv[i]; + } + tempv += segsize; + rukp += segsize; + } + } /*for jj=0:nsupc */ + return 0; +} + +int_t sTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, float *uval, float *tempv, + int_t knsupc, int nsupr, float *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +{ + float alpha = 1.0; + int_t *xsup = Glu_persist->xsup; + // int_t iukp = Ublock_info.iukp; + // int_t rukp = Ublock_info.rukp; + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + iukp += UB_DESCRIPTOR; + + // printf("klst inside task%d\n", ); + /*find ldu */ + int ldu = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; + } + + /*pack U block into a dense Block*/ + int ncols = sTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + uval, tempv); + + /*now call strsm on packed dense block*/ + int_t luptr = (knsupc - ldu) * (nsupr + 1); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + + superlu_strsm("L", 
"L", "N", "U", ldu, ncols, alpha, + &lusup[luptr], nsupr, tempv, ldu); + + /*now scatter the output into sparse U block*/ + sTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + return 0; +} + +/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ + +#if 1 + +/***************************************************************************** + * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. + *****************************************************************************/ +void psgstrs2_omp +(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, + sLocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) +{ +#ifdef PI_DEBUG + printf("====Entering psgstrs2==== \n"); +#endif + int iam, pkk; + int incx = 1; + int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int segsize; + int nsupc; /* number of columns in the block */ + int_t luptr, iukp, rukp; + int_t b, gb, j, klst, knsupc, lk, nb; + int_t *xsup = Glu_persist->xsup; + int_t *usub; + float *lusup, *uval; + +#if 0 + //#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* Quick return. */ + lk = LBi (k, grid); /* Local block number */ + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. 
*/ + iam = grid->iam; + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */ + //int gb_col_cycle; /* cycle through block columns */ + klst = FstBlockC (k + 1); + knsupc = SuperSize (k); + usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + uval = Llu->Unzval_br_ptr[lk]; + if (iam == pkk) { + lk = LBj (k, grid); + nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ + lusup = Llu->Lnzval_bc_ptr[lk]; + } else { + nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */ + lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)]; + } + + /////////////////////new-test////////////////////////// + /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */ + + /* Master thread: set up pointers to each block in the row */ + nb = usub[0]; + iukp = BR_HEADER; + rukp = 0; + + /* Sherry: can use the existing Ublock_info[] array, call + Trs2_InitUblock_info(); */ +#undef USE_Ublock_info +#ifdef USE_Ublock_info /** 4/19/2019 **/ + /* Loop through all the row blocks. 
to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); +#else + int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int)); + int* blocks_value_pointers = blocks_index_pointers + nb; + int* nsupc_temp = blocks_value_pointers + nb; + for (b = 0; b < nb; b++) { /* set up pointers to each block */ + blocks_index_pointers[b] = iukp + UB_DESCRIPTOR; + blocks_value_pointers[b] = rukp; + gb = usub[iukp]; + rukp += usub[iukp+1]; + nsupc = SuperSize( gb ); + nsupc_temp[b] = nsupc; + iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ + } +#endif + + // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c + // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for +#ifdef _OPENMP +#pragma omp parallel for schedule(static) default(shared) \ + private(b,j,iukp,rukp,segsize) +#endif + /* Loop through all the blocks in the row. */ + for (b = 0; b < nb; ++b) { +#ifdef USE_Ublock_info + iukp = Ublock_info[b].iukp; + rukp = Ublock_info[b].rukp; +#else + iukp = blocks_index_pointers[b]; + rukp = blocks_value_pointers[b]; +#endif + + /* Loop through all the segments in the block. */ +#ifdef USE_Ublock_info + gb = usub[iukp]; + nsupc = SuperSize( gb ); + iukp += UB_DESCRIPTOR; + for (j = 0; j < nsupc; j++) { +#else + for (j = 0; j < nsupc_temp[b]; j++) { +#endif + segsize = klst - usub[iukp++]; + if (segsize) { +#ifdef _OPENMP +#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30) +#endif + { /* Nonzero segment. 
*/ + int_t luptr = (knsupc - segsize) * (nsupr + 1); + //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr); + +#if defined (USE_VENDOR_BLAS) + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx, 1, 1, 1); +#else + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx); +#endif + } /* end task */ + rukp += segsize; +#ifndef USE_Ublock_info + stat->ops[FACT] += segsize * (segsize + 1); +#endif + } /* end if segsize > 0 */ + } /* end for j in parallel ... */ +#ifdef _OPENMP +/* #pragma omp taskwait */ +#endif + } /* end for b ... */ + +#ifndef USE_Ublock_info + /* Deallocate memory */ + SUPERLU_FREE(blocks_index_pointers); +#endif + +#if 0 + //#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + +} /* psgstrs2_omp */ + +#else /*==== new version from Piyush ====*/ + +void psgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, + float *Lval_buf, Glu_persist_t *Glu_persist, + gridinfo_t *grid, sLocalLU_t *Llu, SuperLUStat_t *stat, + Ublock_info_t *Ublock_info, float *bigV, int_t ldt, SCT_t *SCT) +{ + double t1 = SuperLU_timer_(); + int_t *xsup = Glu_persist->xsup; + /* Quick return. */ + int_t lk = LBi (k, grid); /* Local block number */ + + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + int_t knsupc = SuperSize (k); + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + float *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + float *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. 
*/ +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic,2) +#endif + for (int_t b = 0; b < nb; ++b) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + float *tempv = bigV + thread_id * ldt * ldt; + sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist); + } /* for b ... */ + + SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1); +} /* pdgstrs2_omp new version from Piyush */ + +#endif /* there are 2 versions of psgstrs2_omp */ diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c new file mode 100644 index 00000000..df3bd986 --- /dev/null +++ b/SRC/psgstrf3d.c @@ -0,0 +1,392 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "slustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PSGSTRF3D performs the LU factorization in parallel using a 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */
+int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
+                trf3Dpartition_t*  trf3Dpartition, SCT_t *SCT,
+                sLUstruct_t *LUstruct, gridinfo3d_t * grid3d,
+                SuperLUStat_t *stat, int *info)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    // problem specific constants
+    int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */
+    // double s_eps = slamch_ ("Epsilon");  -Sherry
+    double s_eps = smach_dist("Epsilon");
+    double thresh = s_eps * anorm;  /* pivot-replacement threshold */
+
+    /* Test the input parameters. */
+    *info = 0;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter psgstrf3d()");
+#endif
+
+    // Initialize stat
+    stat->ops[FACT] = 0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer = 0.0;
+    stat->gpu_buffer = 0.0;
+    //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
+
+    //getting Nsupers
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+
+    // Grid related Variables
+    int_t iam = grid->iam; // in 2D grid
+    int num_threads = getNumThreads(grid3d->iam);
+
+    factStat_t factStat;
+    initFactStat(nsupers, &factStat);
+
+#if 0 // sherry: not used
+    sdiagFactBufs_t dFBuf;
+    sinitDiagFactBufs(ldt, &dFBuf);
+
+    commRequests_t comReqs;
+    initCommRequests(&comReqs, grid);
+
+    msgs_t msgs;
+    initMsgs(&msgs);
+#endif
+
+    SCT->tStartup = SuperLU_timer_();
+    packLUInfo_t packLUInfo;
+    initPackLUInfo(nsupers, &packLUInfo);
+
+    sscuBufs_t scuBufs;
+    sinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid);
+
+    factNodelists_t fNlists;
+    initFactNodelists( ldt, num_threads, nsupers, &fNlists);
+
+    // tag_ub initialization
+    int tag_ub = set_tag_ub();
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;  /* #levels in the Z-reduction tree */
+
+#if ( PRNTlevel>=1 )
+    if (grid3d->iam == 0) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
+#endif
+
+    // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d);
+    gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
+    int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno;
+    int_t* myNodeCount = trf3Dpartition->myNodeCount;
+    int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    int_t** treePerm = trf3Dpartition->treePerm ;
+    sLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
+
+    /* Initializing factorization specific buffers */
+
+    int_t numLA = getNumLookAhead(options);
+    sLUValSubBuf_t** LUvsbs = sLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
+    msgs_t**msgss = initMsgsArr(numLA);
+    int_t mxLeafNode = 0;  /* max #leaves over my subtrees, sizes dFBufs/comReqss */
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl) {
+        if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode )
+            mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
+    }
+    sdiagFactBufs_t** dFBufs = sinitDiagFactBufsArr(mxLeafNode, ldt, grid);
+    commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid);
+
+    /* Setting up GPU related data structures */
+
+    int_t first_l_block_acc = 0;
+    int_t first_u_block_acc = 0;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t mrb = (nsupers + Pr - 1) / Pr;
+    int_t mcb = (nsupers + Pc - 1) / Pc;
+    HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
+
+    sInit_HyP(HyP, Llu, mcb, mrb);
+    HyP->first_l_block_acc = first_l_block_acc;
+    HyP->first_u_block_acc = first_u_block_acc;
+
+    int superlu_acc_offload = HyP->superlu_acc_offload;
+
+    //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    int_t bigu_size = getBigUSize(nsupers, grid,
+                                  LUstruct->Llu->Lrowind_bc_ptr);
+    HyP->bigu_size = bigu_size;
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
+    HyP->buffer_size = buffer_size;
+    HyP->nsupers = nsupers;
+
+#ifdef GPU_ACC
+
+    /*Now initialize the GPU data structure*/
+    sLUstruct_gpu_t *A_gpu, *dA_gpu;
+
+    d2Hreduce_t d2HredObj;
+    d2Hreduce_t* d2Hred = &d2HredObj;
+    ssluGPU_t sluGPUobj;
+    ssluGPU_t *sluGPU = &sluGPUobj;
+    sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm);
+    if (superlu_acc_offload)
+    {
+#if 0 /* Sherry: For GPU code on titan, we do not need performance
+         lookup tables since due to difference in CPU-GPU performance,
+         it didn't make much sense to do any Schur-complement update
+         on CPU, except for the lookahead-update on CPU. Same should
+         hold for summit as well. (from Piyush) */
+
+        /*Initialize the lookup tables */
+        LookUpTableInit(iam);
+        acc_async_cost = get_acc_async_cost();
+#ifdef GPU_DEBUG
+        if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost);
+#endif
+#endif
+
+        //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+        int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                              LUstruct->etree,
+                                              LUstruct->Glu_persist,
+                                              LUstruct->Llu->Lrowind_bc_ptr,
+                                              LUstruct->Llu->Ufstnz_br_ptr,
+                                              grid);
+
+        /* Initialize GPU data structures */
+        sinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno,
+                        n, buffer_size, bigu_size, ldt);
+
+        HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu;
+        HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu;
+        HyP->nGPUStreams = sluGPU->nGPUStreams;
+    }
+
+#endif // end GPU_ACC
+
+    /*==== starting main factorization loop =====*/
+    MPI_Barrier( grid3d->comm);
+    SCT->tStartup = SuperLU_timer_() - SCT->tStartup;
+    // int_t myGrid = grid3d->zscp.Iam;
+
+#ifdef ITAC_PROF
+    VT_traceon();
+#endif
+#ifdef MAP_PROFILE
+    allinea_start_sampling();
+#endif
+    SCT->pdgstrfTimer = SuperLU_timer_();
+
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
+    {
+        /* if I participate in this level */
+        if (!myZeroTrIdxs[ilvl])
+        {
+            //int_t tree = myTreeIdxs[ilvl];
+
+            sForest_t* sforest = sForests[myTreeIdxs[ilvl]];
+
+            /* main loop over all the supernodes */
+            if (sforest) /* 2D factorization at individual subtree */
+            {
+                double tilvl = SuperLU_timer_();
+#ifdef GPU_ACC
+                ssparseTreeFactor_ASYNC_GPU(
+                    sforest,
+                    comReqss, &scuBufs, &packLUInfo,
+                    msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                    &gEtreeInfo, options, iperm_c_supno, ldt,
+                    sluGPU, d2Hred, HyP, LUstruct, grid3d, stat,
+                    thresh, SCT, tag_ub, info);
+#else
+                ssparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo,
+                                        msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                                        &gEtreeInfo, options, iperm_c_supno, ldt,
+                                        HyP, LUstruct, grid3d, stat,
+                                        thresh, SCT, tag_ub, info );
+#endif
+
+                /*now reduce the updates*/
+                SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
+                sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
+            }
+
+            if (ilvl < maxLvl - 1) /*then reduce before factorization*/
+            {
+#ifdef GPU_ACC
+                sreduceAllAncestors3d_GPU(
+                    ilvl, myNodeCount, treePerm, LUvsb,
+                    LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP,
+                    SCT );
+#else
+
+                sreduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+                                      LUvsb, LUstruct, grid3d, SCT );
+#endif
+
+            }
+        } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/
+
+        SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
+            : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
+    } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */
+
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    if (superlu_acc_offload) {
+        sfree_LUstruct_gpu (sluGPU->A_gpu);
+    }
+#endif
+
+    /* Prepare error message - find the smallest index i that U(i,i)==0 */
+    int iinfo;
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+    //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
+
+    SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
+
+#ifdef ITAC_PROF
+    VT_traceoff();
+#endif
+
+#ifdef MAP_PROFILE
+    allinea_stop_sampling();
+#endif
+
+    reduceStat(FACT, stat, grid3d);
+
+    // sherry added
+    /* Deallocate factorization specific buffers */
+    freePackLUInfo(&packLUInfo);
+    sfreeScuBufs(&scuBufs);
+    freeFactStat(&factStat);
+    freeFactNodelists(&fNlists);
+    freeMsgsArr(numLA, msgss);
+    freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss);
+    sLluBufFreeArr(numLA, LUvsbs);
+    sfreeDiagFactBufsArr(mxLeafNode, dFBufs);
+    Free_HyP(HyP);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit psgstrf3d()");
+#endif
+    return 0;
+
+} /* psgstrf3d */
diff --git a/SRC/psgstrs.c b/SRC/psgstrs.c
new file mode 100644
index 00000000..3658bd6c
--- /dev/null
+++ b/SRC/psgstrs.c
@@ -0,0 +1,2400 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Solves a system of distributed linear equations A*X = B with a
+ * general N-by-N matrix A using the LU factors computed previously.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
+ * 
+ */ +#include +#include "superlu_sdefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + +/* + * Sketch of the algorithm for L-solve: + * ======================= + * + * Self-scheduling loop: + * + * while ( not finished ) { .. use message counter to control + * + * reveive a message; + * + * if ( message is Xk ) { + * perform local block modifications into lsum[]; + * lsum[i] -= L_i,k * X[k] + * if all local updates done, Isend lsum[] to diagonal process; + * + * } else if ( message is LSUM ) { .. this must be a diagonal process + * accumulate LSUM; + * if ( all LSUM are received ) { + * perform triangular solve for Xi; + * Isend Xi down to the current process column; + * perform local block modifications into lsum[]; + * } + * } + * } + * + * + * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array) + * ======================= + * + * lsum[] array (local) + * + lsum has "nrhs" columns, row-wise is partitioned by supernodes + * + stored by row blocks, column wise storage within a row block + * + prepend a header recording the global block number. + * + * lsum[] ilsum[nsupers + 1] + * + * ----- + * | | | <- header of size 2 --- + * --------- <--------------------| | + * | | | | | --- + * | | | | | |-----------| | + * | | | | | | --- + * --------- | |-------| | + * | | | <- header | | --- + * --------- <--------| | |----| | + * | | | | | | | --- + * | | | | | | | + * | | | | | | | + * --------- | | + * | | | <- header | | + * --------- <------------| | + * | | | | | | + * | | | | | | + * | | | | | | + * --------- <---------------| + */ + +/*#define ISEND_IRECV*/ + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ *
+ * B      (input) float*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) float*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ * 
+ */ + +int_t +psReDistribute_B_to_X(float *B, int_t m_loc, int nrhs, int_t ldb, + int_t fst_row, int_t *ilsum, float *x, + sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, + gridinfo_t *grid, sSOLVEstruct_t *SOLVEstruct) +{ + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *perm_r, *perm_c; /* row and column permutation vectors */ + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + int_t *xsup, *supno; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; + int p, procs; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp, pps, ppr; + double t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psReDistribute_B_to_X()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + SendCnt = gstrs_comm->B_to_X_SendCnt; + SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; + RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; + sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; + rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + /* ------------------------------------------------------------ + NOW COMMUNICATE THE ACTUAL DATA. 
+ ------------------------------------------------------------*/ + + if(procs==1){ // faster memory copy when procs=1 + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { + // t = SuperLU_timer_(); +#ifdef _OPENMP +#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied +#endif + for (i = 0; i < m_loc; ++i) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */ + + k = BlockNum( irow ); + knsupc = SuperSize( k ); + l = X_BLK( k ); + + x[l - XK_H] = k; /* Block number prepended in the header. */ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = B[i + j*ldb]; + } + } + } + } + }else{ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)* (size_t)nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls[p] * nrhs; + } + + /* Copy the row indices and values to the send buffer. 
*/ + // t = SuperLU_timer_(); + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + k = ptr_to_ibuf[p]; + send_ibuf[k] = irow; + ++ptr_to_ibuf[p]; + + k = ptr_to_dbuf[p]; + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + send_dbuf[k++] = B[i + j*ldb]; + } + ptr_to_dbuf[p] += nrhs; + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to send buffer time\t%8.4f\n", t); + +#if 0 + #if 1 + /* Communicate the (permuted) row indices. */ + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + /* Communicate the numerical values. */ + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + #else + /* Communicate the (permuted) row indices. */ + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); + /* Communicate the numerical values. 
*/ + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm, &req_d); + MPI_Wait(&req_i,&status); + MPI_Wait(&req_d,&status); + #endif +#endif + MPI_Barrier( grid->comm ); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + + if(SendCnt[pps]>0){ + MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt[ppr]>0){ + MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + if(SendCnt_nrhs[pps]>0){ + MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt_nrhs[ppr]>0){ + MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + /* ------------------------------------------------------------ + Copy buffer into X on the diagonal processes. + ------------------------------------------------------------*/ + + // t = SuperLU_timer_(); + ii = 0; + for (p = 0; p < procs; ++p) { + jj = rdispls_nrhs[p]; + for (i = 0; i < RecvCnt[p]; ++i) { + /* Only the diagonal processes do this; the off-diagonal processes + have 0 RecvCnt. */ + irow = recv_ibuf[ii]; /* The permuted row index. 
*/ + k = BlockNum( irow ); + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number. */ + l = X_BLK( lk ); + x[l - XK_H] = k; /* Block number prepended in the header. */ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = recv_dbuf[jj++]; + } + ++ii; + } + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to x time\t%8.4f\n", t); + + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); + } + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psReDistribute_B_to_X()"); +#endif + return 0; +} /* psReDistribute_B_to_X */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ * </pre>
+ */
+
+int_t
+psReDistribute_X_to_B(int_t n, float *B, int_t m_loc, int_t ldb, int_t fst_row,
+                      int_t nrhs, float *x, int_t *ilsum,
+                      sScalePermstruct_t *ScalePermstruct,
+                      Glu_persist_t *Glu_persist, gridinfo_t *grid,
+                      sSOLVEstruct_t *SOLVEstruct)
+{
+    int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk;
+    int_t *xsup, *supno;
+    int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
+    int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs;
+    int *ptr_to_ibuf, *ptr_to_dbuf;
+    int_t *send_ibuf, *recv_ibuf;
+    float *send_dbuf, *recv_dbuf;
+    int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+    int iam, p, q, pkk, procs;
+    int_t num_diag_procs, *diag_procs;
+    MPI_Request req_i, req_d, *req_send, *req_recv;
+    MPI_Status status, *status_send, *status_recv;
+    int Nreq_recv, Nreq_send, pp,pps,ppr;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter psReDistribute_X_to_B()");
+#endif
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    iam = grid->iam;
+    procs = grid->nprow * grid->npcol;
+
+    /* The eight count/displacement arrays (each of length procs) are stored
+       back-to-back in gstrs_comm->X_to_B_SendCnt; per the doc comment above,
+       they were set up by psgstrs_init(). */
+    SendCnt = gstrs_comm->X_to_B_SendCnt;
+    SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs;
+    RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs;
+    RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs;
+    sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs;
+    sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs;
+    rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs;
+    rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs;
+    ptr_to_ibuf = gstrs_comm->ptr_to_ibuf;
+    ptr_to_dbuf = gstrs_comm->ptr_to_dbuf;
+
+
+    if(procs==1){ //faster memory copy when procs=1
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared)
+#endif
+	{
+#ifdef _OPENMP
+#pragma omp master
+#endif
+	    {
+		// t = SuperLU_timer_();
+#ifdef _OPENMP
+#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
+#endif
+		/* Single-process fast path: copy each supernode block of x
+		   straight into B, skipping all buffer packing and MPI. */
+		for (k = 0; k < nsupers; k++) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid ); /* Local block number */
+		    irow = FstBlockC( k );
+		    l = X_BLK( lk );
+		    for (i = 0; i < knsupc; ++i) {
+			RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+			    B[irow-fst_row +i + j*ldb] = x[l + i + j*knsupc];
+			}
+		    }
+		}
+	    }
+	}
+    }else{
+	k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */
+	l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */
+	if ( !(send_ibuf = intMalloc_dist(k + l)) )
+	    ABORT("Malloc fails for send_ibuf[].");
+	recv_ibuf = send_ibuf + k;    /* recv area shares the same allocation */
+	if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) )
+	    ABORT("Malloc fails for send_dbuf[].");
+	if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
+	    ABORT("Malloc fails for req_send[].");
+	if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) )
+	    ABORT("Malloc fails for req_recv[].");
+	if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
+	    ABORT("Malloc fails for status_send[].");
+	if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) )
+	    ABORT("Malloc fails for status_recv[].");
+	recv_dbuf = send_dbuf + k * nrhs;
+	for (p = 0; p < procs; ++p) {
+	    ptr_to_ibuf[p] = sdispls[p];
+	    ptr_to_dbuf[p] = sdispls_nrhs[p];
+	}
+	num_diag_procs = SOLVEstruct->num_diag_procs;
+	diag_procs = SOLVEstruct->diag_procs;
+	/* Pack the (permuted) row indices and the x values into the send
+	   buffers; the destination process of row ii is row_to_proc[ii]. */
+	for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */
+	    pkk = diag_procs[p];
+	    if ( iam == pkk ) {
+		for (k = p; k < nsupers; k += num_diag_procs) {
+		    knsupc = SuperSize( k );
+		    lk = LBi( k, grid ); /* Local block number */
+		    irow = FstBlockC( k );
+		    l = X_BLK( lk );
+		    for (i = 0; i < knsupc; ++i) {
+ #if 0
+			ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
+ #else
+			ii = irow;
+ #endif
+			q = row_to_proc[ii];
+			jj = ptr_to_ibuf[q];
+			send_ibuf[jj] = ii;
+			jj = ptr_to_dbuf[q];
+			RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+			    send_dbuf[jj++] = x[l + i + j*knsupc];
+			}
+			++ptr_to_ibuf[q];
+			ptr_to_dbuf[q] += nrhs;
+			++irow;
+		    }
+		}
+	    }
+	}
+
+	/* ------------------------------------------------------------
+	   COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
+	   ------------------------------------------------------------*/
+#if 0
+	#if 1
+	MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		      recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+	MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs,MPI_FLOAT,
+		      recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		      grid->comm);
+	#else
+	MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		       recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i);
+	MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT,
+		       recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		       grid->comm,&req_d);
+	MPI_Wait(&req_i,&status);
+	MPI_Wait(&req_d,&status);
+	#endif
+#endif
+
+	/* Nonblocking point-to-point exchange in a shifted (ring) ordering:
+	   row indices first (tag 0), then values (tag 1).  The collective
+	   MPI_Alltoallv variant is kept above but disabled under "#if 0".
+	   NOTE(review): the two for-loop headers below are garbled in this
+	   diff chunk (text following a '<' was lost in extraction); upstream
+	   presumably reads "for (pp=0;pp<procs;pp++){ pps = grid->iam+1+pp;"
+	   -- verify against the original source. */
+	MPI_Barrier( grid->comm );
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;ppiam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt[pps]>0){
+		MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt[ppr]>0){
+		MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;ppiam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt_nrhs[pps]>0){
+		MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt_nrhs[ppr]>0){
+		MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+
+	/* ------------------------------------------------------------
+	   COPY THE BUFFER INTO B.
+	   ------------------------------------------------------------*/
+	for (i = 0, k = 0; i < m_loc; ++i) {
+	    irow = recv_ibuf[i];
+	    irow -= fst_row; /* Relative row number */
+	    RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+		B[irow + j*ldb] = recv_dbuf[k++];
+	    }
+	}
+
+	/* recv_ibuf/recv_dbuf live inside send_ibuf/send_dbuf, so freeing
+	   the two base pointers releases everything. */
+	SUPERLU_FREE(send_ibuf);
+	SUPERLU_FREE(send_dbuf);
+	SUPERLU_FREE(req_send);
+	SUPERLU_FREE(req_recv);
+	SUPERLU_FREE(status_send);
+	SUPERLU_FREE(status_recv);
+}
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit psReDistribute_X_to_B()");
+#endif
+    return 0;
+
+} /* psReDistribute_X_to_B */
+
+
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Compute the inverse of the diagonal blocks of the L and U
+ *   triangular matrices.
+ * </pre>
+ */
+void
+psCompute_Diag_Inv(int_t n, sLUstruct_t *LUstruct,gridinfo_t *grid,
+                   SuperLUStat_t *stat, int *info)
+{
+#ifdef SLU_HAVE_LAPACK
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    float *lusup;
+    float *recvbuf, *tempv;
+    float *Linv;/* Inverse of diagonal block */
+    float *Uinv;/* Inverse of diagonal block */
+
+    int_t kcol, krow, mycol, myrow;
+    int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
+    int_t nb, nlb,nlb_nodiag, nub, nsupers;
+    int_t *xsup, *supno, *lsub, *usub;
+    int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/
+    int Pc, Pr, iam;
+    int knsupc, nsupr;
+    int ldalsum; /* Number of lsum entries locally owned. */
+    int maxrecvsz, p, pi;
+    int_t **Lrowind_bc_ptr;
+    float **Lnzval_bc_ptr;
+    float **Linv_bc_ptr;
+    float **Uinv_bc_ptr;
+    int INFO;
+    double t;
+
+    float one = 1.0;
+    float zero = 0.0;
+
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_();
+#endif
+
+#if ( PRNTlevel>=2 )
+    if ( grid->iam==0 ) {
+	printf("computing inverse of diagonal blocks...\n");
+	fflush(stdout);
+    }
+#endif
+
+    /*
+     * Initialization.
+     */
+    iam = grid->iam;
+    Pc = grid->npcol;
+    Pr = grid->nprow;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    xsup = Glu_persist->xsup;
+    supno = Glu_persist->supno;
+    nsupers = supno[n-1] + 1;
+    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    Linv_bc_ptr = Llu->Linv_bc_ptr;
+    Uinv_bc_ptr = Llu->Uinv_bc_ptr;
+    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
+
+    Llu->inv = 1;   /* record that inverted diagonal blocks are available */
+
+    /*---------------------------------------------------
+     * Compute inverse of L(lk,lk).
+     *---------------------------------------------------*/
+
+    for (k = 0; k < nsupers; ++k) {
+	krow = PROW( k, grid );
+	if ( myrow == krow ) {
+	    lk = LBi( k, grid ); /* local block number */
+	    kcol = PCOL( k, grid );
+	    if ( mycol == kcol ) { /* diagonal process */
+
+		lk = LBj( k, grid ); /* Local block number, column-wise. */
+		lsub = Lrowind_bc_ptr[lk];
+		lusup = Lnzval_bc_ptr[lk];
+		Linv = Linv_bc_ptr[lk];
+		Uinv = Uinv_bc_ptr[lk];
+		nsupr = lsub[1];
+		knsupc = SuperSize( k );
+
+		/* NOTE(review): the text between the "for (j=0 ; j" line
+		   below and the stray "=1 )" fragment is garbled in this
+		   diff chunk -- a span following a '<' was lost in
+		   extraction.  Presumably that span held the loop body that
+		   copies the diagonal block into Linv/Uinv and inverts it
+		   (LAPACK call), plus the closing braces and an
+		   "#if ( PRNTlevel>=1 )" to which the fragment belongs.
+		   Verify against the original source before using this hunk. */
+		for (j=0 ; j=1 )
+	if( grid->iam==0 ) {
+	    t = SuperLU_timer_() - t;
+	    printf(".. L-diag_inv time\t%10.5f\n", t);
+	    fflush(stdout);
+	}
+#endif
+
+    return;
+#endif /* SLU_HAVE_LAPACK */
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PSGSTRS solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by PSGSTRF.
+ * If the equilibration, and row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from PSGSTRF for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *        A may be scaled and permuted into A1, so that
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * </pre>
+ */ + +void +psgstrs(int_t n, sLUstruct_t *LUstruct, + sScalePermstruct_t *ScalePermstruct, + gridinfo_t *grid, float *B, + int_t m_loc, int_t fst_row, int_t ldb, int nrhs, + sSOLVEstruct_t *SOLVEstruct, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float beta = 0.0; + float zero = 0.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + /* NOTE: x and lsum are of same size. */ + float *lusup, *dest; + float *recvbuf, *recvbuf_on, *tempv, + *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin; + float *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */ + float *Linv; /* Inverse of diagonal block */ + float *Uinv; /* Inverse of diagonal block */ + int *ipiv; + int_t *leaf_send; + int_t nleaf_send, nleaf_send_tmp; + int_t *root_send; + int_t nroot_send, nroot_send_tmp; + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + /*-- Data structures used for broadcast and reduction trees. --*/ + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + int_t *Urbs1; /* Number of row blocks in each block column of U. */ + int_t *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, kk, lb, ljb, lk, lib, lptr, luptr, gb, nn; + int_t nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i,maxsuper; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr, nprobe; + int nbtree, nrtree, outcount; + int ldalsum; /* Number of lsum entries locally owned. 
*/ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + float **Linv_bc_ptr; + float **Uinv_bc_ptr; + float sum; + MPI_Status status,status_on,statusx,statuslsum; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + SuperLUStat_t **stat_loc; + + double tmax; + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve -- + Count the number of local block products to + be summed into lsum[lk]. */ + int_t fmod_tmp; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t nfrecvx_buf=0; + int_t *frecv; /* Count of lsum[lk] contributions to be received + from processes in this row. + It is only valid on the diagonal processes. */ + int_t frecv_tmp; + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nfrecv = 0; /* Count of total messages to be recv'd. */ + int_t nbrecv = 0; /* Count of total messages to be recv'd. */ + int_t nleaf = 0, nroot = 0; + int_t nleaftmp = 0, nroottmp = 0; + int_t msgsize; + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t bmod_tmp; + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t nbrecvx_buf=0; + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ + int_t flagx,flaglsum,flag; + int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; + int_t TAG; + double t1_sol, t2_sol, t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t gik,iklrow,fnz; + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + int INFO, pad; + int_t tmpresult; + + // #if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + // #endif + + int_t msgcnt[4]; /* Count the size of the message xfer'd in each buffer: + * 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int iword = sizeof (int_t); + int dword = sizeof (float); + int Nwork; + int_t procs = grid->nprow * grid->npcol; + yes_no_t done; + yes_no_t startforward; + int nbrow; + int_t ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc; + int_t lptr1_tmp, idx_i, idx_v,m; + int_t ready; + int thread_id = 0; + yes_no_t empty; + int_t sizelsum,sizertemp,aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int num_thread = 1; + + maxsuper = sp_ienv_dist(3); + +//#ifdef _OPENMP +//#pragma omp threadprivate(thread_id) +//#endif + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + if (omp_get_thread_num () == 0) { + num_thread = omp_get_num_threads (); + } + } +#else + num_thread=1; +#endif + +#if ( PRNTlevel>=1 ) + if( grid->iam==0 ) { + printf("num_thread: %5d\n", num_thread); + fflush(stdout); + } +#endif + + MPI_Barrier( grid->comm ); + t1_sol = SuperLU_timer_(); + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS", grid, -*info); + return; + } + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + stat->utime[SOL_COMM] = 0.0; + stat->utime[SOL_GEMM] = 0.0; + stat->utime[SOL_TRSM] = 0.0; + stat->utime[SOL_TOT] = 0.0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs()"); +#endif + + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(fmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i]; + if ( !(frecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for frecv[]."); + Llu->frecv = frecv; + + if ( !(leaf_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for leaf_send[]."); + nleaf_send=0; + if ( !(root_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for root_send[]."); + nroot_send=0; + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H); + sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d; + +#ifdef _OPENMP + if ( !(lsum = (float*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(float)))) + ABORT("Malloc fails for lsum[]."); +#pragma omp parallel default(shared) private(ii,thread_id) + { + thread_id = omp_get_thread_num(); //mjc + for (ii=0; ii=2 ) + /* Dump the L factor using matlab triple-let format. */ + sDumpLblocks(iam, nsupers, grid, Glu_persist, Llu); +#endif + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + /* Redistribute B into X on the diagonal processes. */ + psReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); + fflush(stdout); + t = SuperLU_timer_(); +#endif + + /* Set up the headers in lsum[]. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + } + } + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. 
+ --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nfrecvx_buf++; + } + BcTree_allocateRequest(LBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( leafsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for leafsups."); + + nrtree = 0; + nleaf=0; + nfrecvmod=0; + + + +if(procs==1){ + for (lk=0;lknprow; /* not sure */ + if(gbnprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); + fflush(stdout); +#endif + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + +#if ( VAMPIR>=1 ) + // VT_initialize(); + VT_traceon(); +#endif + +#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); + fflush(stdout); +#endif + + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id=0; +#endif + { + + if (Llu->inv == 1) { /* Diagonal is inverted. 
*/ + +#ifdef _OPENMP +#pragma omp for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + +#ifdef _OPENMP +#pragma omp simd +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + } + } + } else { /* Diagonal is not inverted. 
*/ +#ifdef _OPENMP +#pragma omp for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if (LBtree_ptr[lk]!=NULL) { + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + } /* end a block */ + } /* end for jj ... */ + } /* end else ... diagonal is not inverted */ + } + } /* end parallel region */ + + jj=0; + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { + +#ifdef _OPENMP +#pragma omp master +#endif + { + +#ifdef _OPENMP +#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. 
*/ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } + } + + +#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + { + for ( nfrecv =0; nfrecv=1 ) + TIC(t1); + // msgcnt[1] = maxrecvsz; +#endif + + recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + // MPI_Irecv(recvbuf0,maxrecvsz,MPI_FLOAT,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req); + // ready=0; + // while(ready==0){ + // MPI_Test(&req,&ready,&status); + // #pragma omp taskyield + // } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + { + + k = *recvbuf0; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + if(status.MPI_TAG==BC_L){ + // --nfrecvx; + nfrecvx_buf++; + { + lk = LBj( k, grid ); /* local block number */ + + if(BcTree_getDestCount(LBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + if(myrow==krow){ + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + }else{ + nb = lsub[0]; + knsupc = SuperSize( k ); + xin = &recvbuf0[XK_H] ; + } + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + + } /* if lsub */ + } + }else if(status.MPI_TAG==RD_L){ + // --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + fmod_tmp=--fmod[lk*aln_i]; + { + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( fmod_tmp==0 ) { + if(RdTree_IsRoot(LRtree_ptr[lk],'s')==YES){ + // ii = X_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=1 ) + TIC(t1); +#endif + if(Llu->inv == 1){ + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinnv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); 
+#endif + } /* end if-else */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + /* + * Perform local block modifications. + */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + } /* if lsub */ + // } + + }else{ /* fmod_tmp != 0 */ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) { + printf(".. L-solve time\t%8.4f\n", t); + fflush(stdout); + } + + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. L-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel==2 ) + { + printf("(%d) .. 
After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + fflush(stdout); + } + MPI_Barrier( grid->comm ); + } + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(leaf_send); + SUPERLU_FREE(leafsups); + SUPERLU_FREE(recvbuf_BC_fwd); + log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat); //account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd + + for (lk=0;lkcomm ); + +#if ( VAMPIR>=1 ) + VT_traceoff(); + VT_finalize(); +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(bmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i]; + if ( !(brecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for brecv[]."); + Llu->brecv = brecv; + + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + + /* Re-initialize lsum to zero. Each block header is already in place. */ + +#ifdef _OPENMP + +#pragma omp parallel default(shared) private(ii) + { + int thread_id = omp_get_thread_num(); + for(ii=0;ii=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. 
row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nbrecvx_buf++; + } + BcTree_allocateRequest(UBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( rootsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for rootsups."); + + nrtree = 0; + nroot=0; + for (lk=0;lknprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nbrecvx %4d, nbrecvmod %4d, nroot %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree); + fflush(stdout); +#endif + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. 
+ */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); + fflush(stdout); +#endif + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp,thread_id) nogroup +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif +#ifdef _OPENMP + thread_id = omp_get_thread_num (); +#else + thread_id = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + // bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + + Uinv = Uinv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(UBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nroot_send_tmp = ++nroot_send; + root_send[(nroot_send_tmp-1)*aln_i] = lk; + + } + } /* for jj ... 
*/ + } /* omp master region */ + } /* omp parallel region */ + + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } +} + + /* + * Compute the internal nodes asychronously by all processes. + */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id=omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + for ( nbrecv =0; nbrecv=1 ) + TIC(t1); +#endif + + recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + k = *recvbuf0; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + fflush(stdout); +#endif + if(status.MPI_TAG==BC_U){ + // --nfrecvx; + nbrecvx_buf++; + lk = LBj( k, grid ); /* local block number */ + if(BcTree_getDestCount(UBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + slsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + }else if(status.MPI_TAG==RD_U){ + + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[lk*aln_i]; + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( bmod_tmp==0 ) { + if(RdTree_IsRoot(URtree_ptr[lk],'s')==YES){ + + knsupc = SuperSize( k ); + for (ii=1;iiinv == 1){ + + Uinv = Uinv_bc_ptr[lk]; + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + if(UBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + + }else{ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) printf(".. U-solve time\t%8.4f\n", t); + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. U-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + { + float *x_col; + int diag; + printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + diag = PNUM( krow, kcol, grid); + if ( iam == diag ) { /* Diagonal process. */ + lk = LBi( k, grid ); + jj = X_BLK( lk ); + x_col = &x[jj]; + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+i, x_col[i]); + } + x_col += knsupc; + } + } + ii += knsupc; + } /* for k ... */ + } +#endif + + psReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
X to B redistribute time\t%8.4f\n", t); + t = SuperLU_timer_(); +#endif + + double tmp1=0; + double tmp2=0; + double tmp3=0; + double tmp4=0; + for(i=0;iutime[SOL_TRSM]); + tmp2 = SUPERLU_MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]); + tmp3 = SUPERLU_MAX(tmp3,stat_loc[i]->utime[SOL_COMM]); + tmp4 += stat_loc[i]->ops[SOLVE]; +#if ( PRNTlevel>=2 ) + f(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]); +#endif + } + + stat->utime[SOL_TRSM] += tmp1; + stat->utime[SOL_GEMM] += tmp2; + stat->utime[SOL_COMM] += tmp3; + stat->ops[SOLVE]+= tmp4; + + /* Deallocate storage. */ + for(i=0;icomm ); + + +#if ( PROFlevel>=2 ) + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) { + printf ("\tPDGSTRS comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + } + } +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t1_sol; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs()"); +#endif + +#if ( PRNTlevel>=2 ) + float for_lu, total, max, avg, temp; + superlu_dist_mem_usage_t num_mem_usage; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + temp = num_mem_usage.total; + + MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + 
printf("**************************************************\n"); + fflush(stdout); + } +#endif + + return; +} /* PSGSTRS */ + diff --git a/SRC/psgstrs1.c b/SRC/psgstrs1.c new file mode 100644 index 00000000..83082157 --- /dev/null +++ b/SRC/psgstrs1.c @@ -0,0 +1,910 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * 
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRS1 solves a system of distributed linear equations
+ *
+ *                   op( sub(A) ) * X = sub( B )
+ *
+ * with a general N-by-N distributed matrix sub( A ) using the LU
+ * factorization computed by PSGSTRF.
+ *
+ * This routine is used only in the iterative refinement routine
+ * psgsrfs_ABXglobal, assuming that the right-hand side is already
+ * distributed in the diagonal processes.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures to store L and U factors,
+ *        and the permutation vectors.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t' structure.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * x      (input/output) float*
+ *        On entry, the right hand side matrix.
+ *        On exit, the solution matrix if info = 0;
+ *
+ *        NOTE: the right-hand side matrix is already distributed on
+ *              the diagonal processes.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves;
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *x, int nrhs, SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int iam, kcol, krow, mycol, myrow; + int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int_t Pc, Pr; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#ifdef ISEND_IRECV + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -8; + if ( *info ) { + pxerr_dist("PSGSTRS1", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs1()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#ifdef ISEND_IRECV + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Compute ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Prepended the block number in the header for lsum[]. + */ + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; + } + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + /*PrintInt10("mod_bit", nlb, mod_bit);*/ + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* diagonal process */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. 
*/ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( !frecv[lk] && !fmod[lk] ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* + * Compute the internal nodes asynchronously by all processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. */ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications. 
+ */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + if ( !iam ) printf("\n.. After L-solve: y =\n"); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. 
+ */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. 
*/ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nlb is the number of local block rows. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. 
*/ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( !brecv[lk] && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. 
*/ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... */ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( !(--brecv[lk]) && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + + /* Deallocate storage. */ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs1()"); +#endif + +} /* PSGSTRS1 */ diff --git a/SRC/psgstrs_Bglobal.c b/SRC/psgstrs_Bglobal.c new file mode 100644 index 00000000..eb972a16 --- /dev/null +++ b/SRC/psgstrs_Bglobal.c @@ -0,0 +1,1040 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * 
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif +static void gather_diag_to_all(int_t, int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t, int_t [], + int_t [], float [], int_t, float []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by psgstrf.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS  matrix B must reside on all
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ * 
+ */ + +void +psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *B, int_t ldb, int nrhs, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#if defined (ISEND_IRECV) || defined (BSEND) + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. 
*/ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS_BGLOBAL", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs_Bglobal()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#if defined (ISEND_IRECV) || defined (BSEND) + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + if ( !(x = floatMalloc_dist(((size_t)ldalsum) * nrhs + + nlb * XK_H)) ) + ABORT("Malloc fails for x[]."); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Copy B into X on the diagonal processes. + */ + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + jj = X_BLK( lk ); + x[jj - XK_H] = k; /* Block number prepended in the header. */ + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ + x[i + jj + j*knsupc] = B[i + ii + j*ldb]; + } + } + ii += knsupc; + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. 
*/ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( frecv[lk]==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req,stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. 
*/ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + + + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + printf("\n(%d) .. After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. 
+ * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. 
*/ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. 
*/ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. 
+ */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( brecv[lk]==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... 
*/ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--brecv[lk])==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + + /* Copy the solution X into B (on all processes). */ + { + int_t num_diag_procs, *diag_procs, *diag_len; + float *work; + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); + if ( !(work = floatMalloc_dist(((size_t)jj)*nrhs)) ) + ABORT("Malloc fails for work[]"); + gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, + grid, num_diag_procs, diag_procs, diag_len, + B, ldb, work); + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + SUPERLU_FREE(work); + } + + /* Deallocate storage. 
*/ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(x); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif +#ifdef BSEND + SUPERLU_FREE(send_req); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs_Bglobal()"); +#endif + +} /* PSGSTRS_BGLOBAL */ + + +/* + * Gather the components of x vector on the diagonal processes + * onto all processes, and combine them into the global vector y. + */ +static void +gather_diag_to_all(int_t n, int_t nrhs, float x[], + Glu_persist_t *Glu_persist, sLocalLU_t *Llu, + gridinfo_t *grid, int_t num_diag_procs, + int_t diag_procs[], int_t diag_len[], + float y[], int_t ldy, float work[]) +{ + int_t i, ii, j, k, lk, lwork, nsupers, p; + int_t *ilsum, *xsup; + int iam, knsupc, pkk; + float *x_col, *y_col; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy x vector into a buffer. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ + x_col = &x[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; + lwork += knsupc; + x_col += knsupc; + } + } + MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( work, diag_len[p]*nrhs, MPI_FLOAT, pkk, grid->comm ); + } + /* Scatter work[] into global y vector. 
*/ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + ii = FstBlockC( k ); + y_col = &y[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; + lwork += knsupc; + y_col += ldy; + } + } + } +} /* GATHER_DIAG_TO_ALL */ + diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c new file mode 100644 index 00000000..ead09ab5 --- /dev/null +++ b/SRC/psgstrs_lsum.c @@ -0,0 +1,2126 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
+ * 
+ */ + +#include "superlu_sdefs.h" +#include "superlu_defs.h" + +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t lptr, /* Starting position in lsub[*]. */ + int_t luptr, /* Starting position in lusup[*]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat +) +{ + float alpha = 1.0, beta = 0.0; + float *lusup, *lusup1; + float *dest; + int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; + int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + MPI_Status status; + int test_flag; + +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; +#endif +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Llu->Lrowind_bc_ptr[lk]; + lusup = Llu->Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + for (lb = 0; lb < nlb; ++lb) { + ik = lsub[lptr]; /* Global block number, row-wise. 
*/ + nbrow = lsub[lptr+1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow, 1, 1 ); +#else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#endif + stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + lptr += LB_DESCRIPTOR; + rel = xsup[ik]; /* Global row index of block ik. */ + for (i = 0; i < nbrow; ++i) { + irow = lsub[lptr++] - rel; /* Relative row. */ + RHS_ITERATE(j) + dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; + } + luptr += nbrow; + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_GEMM] += t2; +#endif + + if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( lk ); + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#endif +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_TRSM] += t2; +#endif + + stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, ikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nlb1 = lsub1[0] - 1; + lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; + luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, lptr1, luptr1, xsup, + grid, Llu, send_req, stat); + } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + + } /* for lb ... */ + +} /* sLSUM_FMOD */ + + +/************************************************************************/ +void slsum_bmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). 
*/ + float *xk, /* X[k]. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat + ) +{ +/* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + MPI_Status status; + int test_flag; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. 
*/ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + + if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( ik ); + dest = &x[ii]; + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( !brecv[ik] ) { /* Becomes a leaf node. */ + bmod[ik] = -1; /* Do not solve X[k] in the future. */ + lk1 = LBj( gik, grid ); /* Local block number. */ + lsub = Llu->Lrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#endif + stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < grid->nprow; ++p) { + if ( bsendx_plist[lk1][p] != EMPTY ) { + pi = PNUM( p, gikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + if ( Urbs[lk1] ) + slsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ + +} /* slSUM_BMOD */ + + + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t *leaf_send, + int_t *nleaf_send, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0,malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int knsupc; /* Size of supernode k. */ + int_t nlb; /* Number of L blocks. */ + + + knsupc = SuperSize( k ); + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Llu->Lrowind_bc_ptr[lk]; + nlb = lsub[0] - 1; + + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>8*maxsuper){ + // if(0){ + + // Nchunk=floor(num_thread/2.0)+1; + Nchunk=SUPERLU_MIN(num_thread,nlb); + // Nchunk=1; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); +#else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global 
block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* endd for lb ... */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + for (lb=lbstart;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; + + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); + +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { + + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ + } /* if fmod[lk] == 0 */ + } + + } /* end tasklook for nn ... */ + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... 
*/ + + // TOC(t3, t1); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* end else iam == p */ + } /* if fmod[lk] == 0 */ + } + // } +} + + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + + +} /* if nlb>0*/ +} /* sLSUM_FMOD_INV */ + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void slsum_fmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0, malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. 
*/ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lsub = Llu->Lrowind_bc_ptr[lk]; + + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>4*maxsuper || nrhs>10){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, 
rtemp_loc, &nbrow, 1, 1 ); + #else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } /* end if (lbstart=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. 
*/ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } /* end for lb ... */ +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } /* end else ... */ + // TOC(t3, t1); + rtemp_loc = &rtemp[sizertemp* thread_id]; + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL) + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + + /* + * Perform local block modifications. 
+ */ + +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { + nlb1 = lsub1[0] - 1; + + slsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + } /* end if nlb>0*/ +} /* end slsum_fmod_inv_master */ + + + +/************************************************************************/ +void slsum_bmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t* root_send, + int_t* nroot_send, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. 
*/ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + int_t nroot_send_tmp; + float *Uinv;/* Inverse of diagonal block */ + float temp; + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof(float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + if(Llu->Unnz[lk]>knsupc*64 || nub>16){ + // if(nub>num_thread){ + // if(nub>16){ + // // // // if(Urbs2[lk]>num_thread){ + // if(Urbs2[lk]>0){ + Nchunk=SUPERLU_MIN(num_thread,nub); + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + // printf("Unnz: %5d nub: %5d knsupc: %5d\n",Llu->Unnz[lk],nub,knsupc); +#ifdef _OPENMP +#pragma omp taskloop firstprivate (stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#endif + for (nn=0;nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. 
*/ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* end for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + // for (i=0 ; iUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif + // slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + // stat, root_send, nroot_send, sizelsum,sizertemp); + //}else{ + slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + //} + + // } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* end for ub ... */ + } /* end else ... */ + +} /* slSUM_BMOD_inv */ + + + +/************************************************************************/ +void slsum_bmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. 
*/ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + float temp; + float *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // if(nub>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +//#ifdef _OPENMP +//#pragma omp taskloop firstprivate (stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +//#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + + if(nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + }else{ + rtemp_loc = &rtemp[sizertemp* thread_id]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. 
*/ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + for (ub = 0; ub < nub; ++ub){ + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i + * File name: pslangs.c + * History: Modified from lapack routine SLANGE + *
+ */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSLANGS returns the value of the one norm, or the Frobenius norm, or
+    the infinity norm, or the element of largest absolute value of a
+    real matrix A.
+
+    Description
+    ===========
+
+    PSLANGS returns the value
+
+       PSLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+                 (
+                 ( norm1(A),         NORM = '1', 'O' or 'o'
+                 (
+                 ( normI(A),         NORM = 'I' or 'i'
+                 (
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum),
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.
+
+    Arguments
+    =========
+
+    NORM    (input) CHARACTER*1
+            Specifies the value to be returned in PSLANGS as described above.
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A.
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
+   =====================================================================
+
+*/ + +float pslangs(char *norm, SuperMatrix *A, gridinfo_t *grid) +{ + /* Local variables */ + NRformat_loc *Astore; + int_t m_loc; + float *Aval; + int_t i, j, jcol; + float value=0., sum; + float *rwork; + float tempvalue; + float *temprwork; + + Astore = (NRformat_loc *) A->Store; + m_loc = Astore->m_loc; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + } else if ( strncmp(norm, "M", 1)==0 ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + value = SUPERLU_MAX( value, fabs(Aval[j]) ); + } + + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; +#if 0 + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } +#else /* Sherry ==> */ + if ( !(rwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for rwork."); + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + rwork[jcol] += fabs(Aval[j]); + } + } + + if ( !(temprwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for temprwork."); + MPI_Allreduce(rwork, temprwork, A->ncol, MPI_FLOAT, MPI_SUM, grid->comm); + value = 0.; + for (j = 0; j < A->ncol; ++j) { + value = SUPERLU_MAX(value, temprwork[j]); + } + SUPERLU_FREE (temprwork); + SUPERLU_FREE (rwork); +#endif + } else if ( strncmp(norm, "I", 1)==0 ) { + /* Find normI(A). 
*/ + value = 0.; + sum = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + sum += fabs(Aval[j]); + value = SUPERLU_MAX(value, sum); + } + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else { + ABORT("Illegal norm specified."); + } + + return (value); + +} /* pslangs */ diff --git a/SRC/pslaqgs.c b/SRC/pslaqgs.c new file mode 100644 index 00000000..3fbf9451 --- /dev/null +++ b/SRC/pslaqgs.c @@ -0,0 +1,151 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Equilibrates a general sparse M by N matrix + * + *
+ * File name:	pslaqgs.c
+ * History:     Modified from LAPACK routine SLAQGE
+ * 
+ */ +#include +#include "superlu_sdefs.h" + +/*! \brief + +
+    Purpose
+    =======
+
+    PSLAQGS equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (input) float*, dimension (A->nrow)
+            The row scale factors for A.
+
+    C       (input) float*, dimension (A->ncol)
+            The column scale factors for A.
+
+    ROWCND  (input) float
+            Ratio of the smallest R(i) to the largest R(i).
+
+    COLCND  (input) float
+            Ratio of the smallest C(i) to the largest C(i).
+
+    AMAX    (input) float
+            Absolute value of largest matrix entry.
+
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.
+            = 'N':  No equilibration
+            = 'R':  Row equilibration, i.e., A has been premultiplied by
+                    diag(R).
+            = 'C':  Column equilibration, i.e., A has been postmultiplied
+                    by diag(C).
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).
+
+    Internal Parameters
+    ===================
+
+    THRESH is a threshold value used to decide if row or column scaling
+    should be done based on the ratio of the row or column scaling
+    factors.  If ROWCND < THRESH, row scaling is done, and if
+    COLCND < THRESH, column scaling is done.
+
+    LARGE and SMALL are threshold values used to decide if row scaling
+    should be done based on the absolute size of the largest matrix
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.
+
+    =====================================================================
+
+*/ + +void +pslaqgs(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int_t i, j, irow, jcol, m_loc; + float large, small; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] *= c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + Aval[j] *= r[irow]; + ++irow; + } + *(unsigned char *)equed = 'R'; + } else { + /* Both row and column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] = Aval[j] * r[irow] * c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* pslaqgs */ + diff --git a/SRC/pssymbfact_distdata.c b/SRC/pssymbfact_distdata.c new file mode 100644 index 00000000..1488ee18 --- /dev/null +++ b/SRC/pssymbfact_distdata.c @@ -0,0 +1,2831 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Redistribute the symbolic structure of L and U from the distribution + * + *
+ * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
+ * -- Distributes the data from parallel symbolic factorization
+ * -- to numeric factorization
+ * INRIA France -  July 1, 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ * 
+ */ + +/* limits.h: the largest positive integer (INT_MAX) */ +#include + +#include "superlu_sdefs.h" +#include "psymbfact.h" + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Redistribute the symbolic structure of L and U from the distribution
+ * used in the parallel symbolic factorization step to the distribution
+ * used in the parallel numeric factorization step.  On exit, the L and U
+ * structure for the 2D distribution used in the numeric factorization step is
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal
+ * information is also computed and it is stored in Glu_persist->supno
+ * and Glu_persist->xsup.
+ *
+ * This routine allocates memory for storing the structure of L and U
+ * and the supernodes information.  This represents the arrays:
+ * p_xlsub, p_lsub, p_xusub, p_usub,
+ * Glu_persist->supno,  Glu_persist->xsup.
+ *
+ * This routine also deallocates memory allocated during symbolic
+ * factorization routine.  That is, the following arrays are freed:
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub,
+ * Pslu_freeable->xusub, Pslu_freeable->usub,
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
+ * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (Input) int_t
+ *        Order of the input matrix
+ * Pslu_freeable  (Input) Pslu_freeable_t *
+ *        Local L and U structure,
+ *        global to local indexing information.
+ *
+ * Glu_persist (Output) Glu_persist_t *
+ *        Stores on output the information on supernodes mapping.
+ *
+ * p_xlsub (Output) int_t **
+ *         Pointer to structure of L distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_lsub  (Output) int_t **
+ *         Structure of L distributed on a 2D grid of processors,
+ *         stored by columns.
+ *
+ * p_xusub (Output) int_t **
+ *         Pointer to structure of U distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_usub  (Output) int_t **
+ *         Structure of U distributed on a 2D grid of processors,
+ *         stored by rows.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU.
+ *   > 0, number of bytes allocated in this routine when out of memory.
+ *        (an approximation).
+ * 
+ */ + +static float +dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, + Glu_persist_t *Glu_persist, + int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, + gridinfo_t *grid + ) +{ + int iam, nprocs, pc, pr, p, np, p_diag; + int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, + *tmp_ptrToSend, *mem; + int_t *nnzToRecv_l, *nnzToRecv_u; + int_t *send_1, *send_2, nsend_1, nsend_2; + int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; + int_t nsupers, nsupers_i, nsupers_j; + int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; + int_t maxszsn, maxNvtcsPProc; + int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; + int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; + int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; + int_t *xsub_s, *sub_s, *xsub_n, *sub_n; + int_t *globToLoc, nvtcs_loc; + int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, + RecvCnt_l, RecvCnt_u, ind_loc; + int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; + int_t nelts, isize; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword; + + /* ------------------------------------------------------------ + INITIALIZATION. 
+ ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_symbLU()"); +#endif + nprocs = (int) grid->nprow * grid->npcol; + xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; + xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; + maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; + globToLoc = Pslu_freeable->globToLoc; + nvtcs_loc = Pslu_freeable->nvtcs_loc; + xsup_beg_s = Pslu_freeable->xsup_beg_loc; + xsup_end_s = Pslu_freeable->xsup_end_loc; + supno_s = Pslu_freeable->supno_loc; + rcv_luind = NULL; + iword = sizeof(int_t); + dword = sizeof(float); + memAux = 0.; memRet = 0.; + + mem = intCalloc_dist(12 * nprocs); + if (!mem) + return (ERROR_RET); + memAux = (float) (12 * nprocs * sizeof(int_t)); + nnzToRecv = mem; + nnzToSend = nnzToRecv + 2*nprocs; + nnzToSend_l = nnzToSend + 2 * nprocs; + nnzToSend_u = nnzToSend_l + nprocs; + send_1 = nnzToSend_u + nprocs; + send_2 = send_1 + nprocs; + tmp_ptrToSend = send_2 + nprocs; + nnzToRecv_l = tmp_ptrToSend + nprocs; + nnzToRecv_u = nnzToRecv_l + nprocs; + + ptrToSend = nnzToSend; + ptrToRecv = nnzToSend + nprocs; + + nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); + intBuf1 = nvtcs + nprocs; + intBuf2 = nvtcs + 2 * nprocs; + intBuf3 = nvtcs + 3 * nprocs; + intBuf4 = nvtcs + 4 * nprocs; + memAux += 5 * nprocs * sizeof(int); + + maxszsn = sp_ienv_dist(3); + + /* Allocate space for storing Glu_persist_n. 
*/ + if ( !(supno_n = intMalloc_dist(n+1)) ) { + fprintf (stderr, "Malloc fails for supno_n[]."); + return (memAux); + } + memRet += (float) ((n+1) * sizeof(int_t)); + + /* ------------------------------------------------------------ + DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION + ------------------------------------------------------------*/ + + if (nvtcs_loc > INT_MAX) + ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); + intNvtcs_loc = (int) nvtcs_loc; + MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, + 0, grid->comm); + + if (!iam) { + /* set ptrToRecv to point to the beginning of the data for + each processor */ + for (k = 0, p = 0; p < nprocs; p++) { + ptrToRecv[p] = k; + k += nvtcs[p]; + } + } + + if (nprocs > 1) { + temp = NULL; + if (!iam ) { + if ( !(temp = intMalloc_dist (n+1)) ) { + fprintf (stderr, "Malloc fails for temp[]."); + return (memAux + memRet); + } + memAux += (float) (n+1) * iword; + } +#if defined (_LONGINT) + for (p=0; p INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = ptrToRecv; +#endif + MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, + temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); + } + else + temp = supno_s; + + if (!iam) { + nsupers = 0; + p = (int) OWNER( globToLoc[0] ); + gb = temp[ptrToRecv[p]]; + supno_n[0] = nsupers; + ptrToRecv[p] ++; + szsn = 1; + for (j = 1; j < n; j ++) { + if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { + nsupers ++; + p = (int) OWNER( globToLoc[j] ); + gb = temp[ptrToRecv[p]]; + szsn = 1; + } + else { + szsn ++; + } + ptrToRecv[p] ++; + supno_n[j] = nsupers; + } + nsupers++; + if (nprocs > 1) { + SUPERLU_FREE (temp); + memAux -= (float) (n+1) * iword; + } + supno_n[n] = nsupers; + } + + /* reset to 0 nnzToSend */ + for (p = 0; p < 2 *nprocs; p++) + nnzToSend[p] = 0; + + MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); + nsupers = supno_n[n]; + /* Allocate 
space for storing Glu_persist_n. */ + if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { + fprintf (stderr, "Malloc fails for xsup_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers+1) * iword; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF L and U. + ------------------------------------------------------------*/ + gb = EMPTY; + for (i = 0; i < n; i++) { + if (gb != supno_n[i]) { + /* a new supernode starts */ + gb = supno_n[i]; + xsup_n[gb] = i; + } + } + xsup_n[nsupers] = n; + + for (p = 0; p < nprocs; p++) { + send_1[p] = FALSE; + send_2[p] = FALSE; + } + for (gb_n = 0; gb_n < nsupers; gb_n ++) { + i = xsup_n[gb_n]; + if (iam == (int) OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { + k = lsub_s[j]; + if (k >= i) { + gb = supno_n[k]; + p = (int) PNUM( PROW(gb, grid), pc, grid ); + nnzToSend[2*p] ++; + send_1[p] = TRUE; + } + } + for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { + k = usub_s[j]; + if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { + gb = supno_n[k]; + p = PNUM( pr, PCOL(gb, grid), grid); + nnzToSend[2*p+1] ++; + send_2[p] = TRUE; + } + } + + nsend_2 = 0; + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + nnzToSend[2*p+1] += 2; + if (send_2[p]) nsend_2 ++; + } + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) + if (send_2[p] || p == p_diag) { + if (p == p_diag && !send_2[p]) + nnzToSend[2*p+1] += nsend_2; + else + nnzToSend[2*p+1] += nsend_2-1; + send_2[p] = FALSE; + } + nsend_1 = 0; + for (p = pc; p < nprocs; p += grid->npcol) { + nnzToSend[2*p] += 2; + if (send_1[p]) nsend_1 ++; + } + 
for (p = pc; p < nprocs; p += grid->npcol) + if (send_1[p]) { + nnzToSend[2*p] += nsend_1-1; + send_1[p] = FALSE; + } + else + nnzToSend[2*p] += nsend_1; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, + grid->comm); + + nnz_loc_l = nnz_loc_u = 0; + SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; + for (p = 0; p < nprocs; p++) { + if ( p != iam ) { + SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; + SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; + RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; + RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } else { + nnz_loc_l += nnzToRecv[2*p]; + nnz_loc_u += nnzToRecv[2*p+1]; + nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; + nnzToRecv_l[p] = nnzToRecv[2*p]; + nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } + } + + /* Allocate space for storing the symbolic structure after redistribution. */ + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for xlsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_j+1) * iword; + + if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for xusub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_i+1) * iword; + + /* Allocate temp storage for sending/receiving the L/U symbolic structure. 
*/ + if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { + if (!(rcv_luind = + intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { + fprintf (stderr, "Malloc fails for rcv_luind[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) + * iword; + } + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { + fprintf (stderr, "Malloc fails for index[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------------ + LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF L and U. + ------------------------------------------------------------------*/ + sendL = TRUE; + sendU = FALSE; + while (sendL || sendU) { + if (sendL) { + xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; + nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; + } + if (sendU) { + xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; + nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; + } + for (i = 0, j = 0, p = 0; p < nprocs; p++) { + if ( p != iam ) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + ptrToRecv[p] = j; j += nnzToRecv[p]; + } + nnzToRecv[iam] = 0; + + ind_loc = ptrToRecv[iam]; + for (gb_n = 0; gb_n < nsupers; gb_n++) { + nsend_2 = 0; + i = xsup_n[gb_n]; + if (iam == OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid ); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + + if (sendL) { + p = pc; np = grid->nprow; + } else { + p = pr * grid->npcol; np = grid->npcol; + } + for (j = 0; j < np; j++) { + if (p == iam) { + rcv_luind[ind_loc] = gb_n; + rcv_luind[ind_loc+1] = 0; + tmp_ptrToSend[p] = ind_loc + 1; + 
ind_loc += 2; + } + else { + snd_luind[ptrToSend[p]] = gb_n; + snd_luind[ptrToSend[p]+1] = 0; + tmp_ptrToSend[p] = ptrToSend[p] + 1; + ptrToSend[p] += 2; + } + if (sendL) p += grid->npcol; + if (sendU) p++; + } + for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { + k = sub_s[j]; + if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { + gb = supno_n[k]; + if (sendL) + p = PNUM( PROW(gb, grid), pc, grid ); + else + p = PNUM( pr, PCOL(gb, grid), grid); + if (send_1[p] == FALSE) { + send_1[p] = TRUE; + send_2[nsend_2] = k; nsend_2 ++; + } + if (p == iam) { + rcv_luind[ind_loc] = k; ind_loc++; + if (sendL) + xsub_n[LBj( gb_n, grid )] ++; + else + xsub_n[LBi( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = k; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + if (sendL) + for (p = pc; p < nprocs; p += grid->npcol) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if (PNUM(PROW(gb, grid), pc, grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBj( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + if (sendU) + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + if (send_1[p] || p == p_diag) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if(PNUM( pr, PCOL(gb, grid), grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBi( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + } + } + } + + /* reset ptrToSnd to point to the beginning of the data for + each processor (structure needed in MPI_Alltoallv) */ + for (i = 0, p = 0; p < nprocs; p++) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. 
THIS INVOLVES ALL-TO-ALL COMMUNICATION. + Note: it uses MPI_Alltoallv. + ------------------------------------------------------------*/ + if (nprocs > 1) { +#if defined (_LONGINT) + nnzToSend[iam] = 0; + for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || + nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) nnzToSend[p]; + intBuf2[p] = (int) ptrToSend[p]; + intBuf3[p] = (int) nnzToRecv[p]; + intBuf4[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = nnzToSend; intBuf2 = ptrToSend; + intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; +#endif + + MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, + rcv_luind, intBuf3, intBuf4, mpi_int_t, + grid->comm); + } + if (sendL) + nnzToRecv[iam] = nnz_loc_l; + else + nnzToRecv[iam] = nnz_loc_u; + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE. + -------------------------------------------------------------*/ + if (sendU) + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + SUPERLU_FREE (snd_luind); + memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------ + CONVERT THE FORMAT. 
+ ------------------------------------------------------------*/ + /* Initialize the array of column of L/ row of U pointers */ + k = 0; + for (p = 0; p < nprocs; p ++) { + if (p != iam) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + nelts = rcv_luind[i+1]; + if (sendL) + xsub_n[LBj( gb, grid )] = nelts; + else + xsub_n[LBi( gb, grid )] = nelts; + i += nelts + 2; + } + } + k += nnzToRecv[p]; + } + + if (sendL) j = nsupers_j; + else j = nsupers_i; + k = 0; + isize = xsub_n[0]; + xsub_n[0] = 0; + for (gb_l = 1; gb_l < j; gb_l++) { + k += isize; + isize = xsub_n[gb_l]; + xsub_n[gb_l] = k; + } + xsub_n[gb_l] = k + isize; + nnz_loc = xsub_n[gb_l]; + if (sendL) { + lsub_n = NULL; + if (nnz_loc) { + if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for lsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = lsub_n; + } + if (sendU) { + usub_n = NULL; + if (nnz_loc) { + if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for usub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = usub_n; + } + + /* Copy the data into the L column / U row oriented storage */ + k = 0; + for (p = 0; p < nprocs; p++) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + if (gb >= nsupers) + printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n", + iam, p, gb, nsupers, i, i-k); + i += 2; + if (sendL) gb_l = LBj( gb, grid ); + if (sendU) gb_l = LBi( gb, grid ); + for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { + sub_n[j] = rcv_luind[i]; + } + } + k += nnzToRecv[p]; + } + if (sendL) { + sendL = FALSE; sendU = TRUE; + } + else + sendU = FALSE; + } + + /* deallocate memory allocated during symbolic factorization routine */ + if (rcv_luind != NULL) { + SUPERLU_FREE (rcv_luind); + memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; + } + SUPERLU_FREE (mem); + memAux -= 
(float) (12 * nprocs * iword); + SUPERLU_FREE(nvtcs); + memAux -= (float) (5 * nprocs * sizeof(int)); + + if (xlsub_s != NULL) { + SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); + } + if (xusub_s != NULL) { + SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); + } + SUPERLU_FREE (globToLoc); + if (supno_s != NULL) { + SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); + SUPERLU_FREE (supno_s); + } + + Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; + *p_xlsub = xlsub_n; *p_lsub = lsub_n; + *p_xusub = xusub_n; *p_usub = usub_n; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit dist_symbLU()"); +#endif + + return (-memRet); +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.  The lower part is
+ *   stored using a column format and the upper part
+ *   is stored using a row format.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_persist  (Input) Glu_persist_t *
+ *        Information on supernodes mapping.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * p_ainf_colptr (Output) int_t**
+ *         Pointer to the lower part of A distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_ainf_rowind (Output) int_t**
+ *         Structure of the lower part of A distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_ainf_val    (Output) float**
+ *         Numerical values of the lower part of A, distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_asup_rowptr (Output) int_t**
+ *         Pointer to the upper part of A distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_asup_colind (Output) int_t**
+ *         Structure of the upper part of A distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * p_asup_val    (Output) float**
+ *         Numerical values of the upper part of A, distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * ilsum_i  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block row wise).
+ *
+ * ilsum_j  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block column wise).
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated when out of memory.
+ *        (an approximation).
+ * </pre>
+ */ + +static float +sdist_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t **p_ainf_colptr, int_t **p_ainf_rowind, float **p_ainf_val, + int_t **p_asup_rowptr, int_t **p_asup_colind, float **p_asup_val, + int_t *ilsum_i, int_t *ilsum_j + ) +{ + int iam, p, procs; + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; + int_t nsupers, nsupers_i, nsupers_j; + int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ptr_to_send; + float *aij, **aij_send, *nzval, *dtemp; + float *nzval_a; + MPI_Request *send_req; + MPI_Status status; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword, szbuf; + + /* ------------------------------------------------------------ + INITIALIZATION. 
+ ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdist_A()"); +#endif + iword = sizeof(int_t); + dword = sizeof(double); + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + if (!(nnzToRecv = intCalloc_dist(2*procs))) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (ERROR_RET); + } + memAux = (float) (2 * procs * iword); + memRet = 0.; + nnzToSend = nnzToRecv + procs; + nsupers = supno[n-1] + 1; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); + + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + szbuf = k; + + /* Allocate space for storing the triplets after redistribution. 
*/ + if ( !(ia = intMalloc_dist(2*k)) ) { + fprintf (stderr, "Malloc fails for ia[]."); + return (memAux); + } + memAux += (float) (2*k*iword); + ja = ia + k; + if ( !(aij = floatMalloc_dist(k)) ) { + fprintf (stderr, "Malloc fails for aij[]."); + return (memAux); + } + memAux += (float) (k*dword); + + /* Allocate temporary storage for sending/receiving the A triplets. */ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { + fprintf (stderr, "Malloc fails for send_req[]."); + return (memAux); + } + memAux += (float) (2*procs *sizeof(MPI_Request)); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for ia_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(int_t*)); + if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for aij_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(float*)); + if ( !(index = intMalloc_dist(2*SendCnt)) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memAux); + } + memAux += (float) (2*SendCnt*iword); + if ( !(nzval = floatMalloc_dist(SendCnt)) ) { + fprintf(stderr, "Malloc fails for nzval[]."); + return (memAux); + } + memAux += (float) (SendCnt * dword); + if ( !(ptr_to_send = intCalloc_dist(procs)) ) { + fprintf(stderr, "Malloc fails for ptr_to_send[]."); + return (memAux); + } + memAux += (float) (procs * iword); + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for itemp[]."); + return (memAux); + } + memAux += (float) (2*maxnnzToRecv*iword); + if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for dtemp[]."); + return (memAux); + } + memAux += (float) (maxnnzToRecv * dword); + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + aij_send[p] = &nzval[j]; + j += 
nnzToSend[p]; + } + } + } /* if procs > 1 */ + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { + fprintf (stderr, "Malloc fails for *ainf_colptr[]."); + return (memAux); + } + memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; + if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { + fprintf (stderr, "Malloc fails for *asup_rowptr[]."); + return (memAux+memRet); + } + memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. */ + nnz_loc_ainf = nnz_loc_asup = 0; + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. 
+ ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_FLOAT, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + irow = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /* assert(jcol= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + memAux -= 2 * procs * iword; + if ( procs > 1 ) { + SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + SUPERLU_FREE(ptr_to_send); + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + + procs*sizeof(float*) + 2*SendCnt * iword + + SendCnt* dword + procs*iword + + 2*maxnnzToRecv*iword + maxnnzToRecv*dword; + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT. 
+ ------------------------------------------------------------*/ + if (nnz_loc_ainf != 0) { + if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_rowind[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * iword); + if ( !(ainf_val = floatMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_val[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * dword); + } + else { + ainf_rowind = NULL; + ainf_val = NULL; + } + if (nnz_loc_asup != 0) { + if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_colind[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * iword); + if ( !(asup_val = floatMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_val[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * dword); + } + else { + asup_colind = NULL; + asup_val = NULL; + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = ainf_colptr[0]; ainf_colptr[0] = 0; + for (j = 1; j < ilsum_j[nsupers_j]; j++) { + k += jsize; + jsize = ainf_colptr[j]; + ainf_colptr[j] = k; + } + ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; + i = 0; + isize = asup_rowptr[0]; asup_rowptr[0] = 0; + for (j = 1; j < ilsum_i[nsupers_i]; j++) { + i += isize; + isize = asup_rowptr[j]; + asup_rowptr[j] = i; + } + asup_rowptr[ilsum_i[nsupers_i]] = i + isize; + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + jcol = ja[i]; + irow = ia[i]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); + k = ainf_colptr[j]; + ainf_rowind[k] = irow; + ainf_val[k] = aij[i]; + ainf_colptr[j] ++; + } + else { + j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); + k = asup_rowptr[j]; + asup_colind[k] = jcol; + asup_val[k] = 
aij[i]; + asup_rowptr[j] ++; + } + } + + /* Reset the column pointers to the beginning of each column */ + for (j = ilsum_j[nsupers_j]; j > 0; j--) + ainf_colptr[j] = ainf_colptr[j-1]; + for (j = ilsum_i[nsupers_i]; j > 0; j--) + asup_rowptr[j] = asup_rowptr[j-1]; + ainf_colptr[0] = 0; + asup_rowptr[0] = 0; + + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + memAux -= 2*szbuf*iword + szbuf*dword; + + *p_ainf_colptr = ainf_colptr; + *p_ainf_rowind = ainf_rowind; + *p_ainf_val = ainf_val; + *p_asup_rowptr = asup_rowptr; + *p_asup_colind = asup_colind; + *p_asup_val = asup_val; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sdist_A()"); + fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); +#endif + + return (-memRet); +} /* dist_A */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Distribute the input matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *          This routine should not be called for this case, an error
+ *          is generated.  Instead, psdistribute routine should be called.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (Input) int
+ *        Dimension of the matrix.
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (Input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * LUstruct (Input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from the dist_symbLU
+ *   > 0, number of bytes allocated for performing the distribution
+ *       of the data, when out of memory.
+ *        (an approximation).
+ * </pre>
+ */ + +float +sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + Pslu_freeable_t *Pslu_freeable, + sLUstruct_t *LUstruct, gridinfo_t *grid) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t Glu_freeable_n; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, + len, len1, nsupc, nsupc_gb, ii, nprocs; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + float *a; + int_t *asub, *xa; + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *xsup, *supno; /* supernode and column mapping */ + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; + int_t next_ind; /* next available position in index[*] */ + int_t next_val; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *uval; /* nonzero values in L and U */ + int_t *recvBuf; + int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + float *lusup_srt; /* nonzero values in L and U */ + float 
**Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in + the full array (local, block column wise) */ + /*-- Auxiliary arrays; freed on return --*/ + int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *LUb_length; /* L,U block length; size nsupers_ij */ + int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ + int_t *LUb_number; /* global block number; size nsupers_ij */ + int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ + int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; +float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + + float memStrLU, memA, + memDist = 0.; /* memory used for redistributing the data, which does + not include the memory for the numerical values + of L and U (positive number)*/ + float memNLU = 0.; /* memory allocated for storing the numerical values of + L and U, that will be used in the numeric + factorization (positive number) */ + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. 
*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); +#endif + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nprocs = grid->npcol * grid->nprow; + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + Astore = (NRformat_loc *) A->Store; + + iword = sizeof(int_t); + dword = sizeof(float); + + if (fact == SamePattern_SameRowPerm) { + ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); + } + + if ((memStrLU = + dist_symbLU (n, Pslu_freeable, + Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) + return (memStrLU); + memDist += (-memStrLU); + xsup = Glu_persist->xsup; /* supernode and column mapping */ + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ + nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ + nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); + if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for ilsum[]."); + return (memDist + memNLU + memTRS); + } + memNLU += (nsupers_i+1) * iword; + if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for ilsum_j[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_j+1) * iword; + + /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. 
*/ + ilsum[0] = 0; + ldaspa = 0; + for (gb = 0; gb < nsupers; gb++) + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + ilsum[nsupers_i] = ldaspa; + + ldaspa_j = 0; ilsum_j[0] = 0; + for (gb = 0; gb < nsupers; gb++) + if (mycol == PCOL( gb, grid )) { + i = SuperSize( gb ); + ldaspa_j += i; + lb = LBj( gb, grid ); + ilsum_j[lb + 1] = ilsum_j[lb] + i; + } + ilsum_j[nsupers_j] = ldaspa_j; + + if ((memA = sdist_A(A, ScalePermstruct, Glu_persist, + grid, &ainf_colptr, &ainf_rowind, &ainf_val, + &asup_rowptr, &asup_colind, &asup_val, + ilsum, ilsum_j)) > 0) + return (memDist + memA + memNLU + memTRS); + memDist += (-memA); + + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ + + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) { + fprintf(stderr, "Calloc fails for ToRecv[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + memNLU += nsupers * iword; + + k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) { + fprintf(stderr, "Malloc fails for ToSendR[]."); + return (memDist + memNLU + memTRS); + } + memNLU += k*sizeof(int_t*); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memDist + memNLU + memTRS); + } + memNLU += j*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + + /* Auxiliary arrays used to set up L and U block data structures. + They are freed on return. 
*/ + if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_length[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Malloc fails for LUb_indptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_number[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_valptr[]."); + return (memDist + memNLU + memTRS); + } + memDist += 4 * nsupers_ij * iword; + + k = CEILING( nsupers, grid->nprow ); + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(nsupers_i * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + memNLU += nsupers_i*sizeof(float*) + nsupers_i*sizeof(int_t*); + Unzval_br_ptr[nsupers_i-1] = NULL; + Ufstnz_br_ptr[nsupers_i-1] = NULL; + + if ( !(ToSendD = SUPERLU_MALLOC(nsupers_i * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for ToSendD[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO; + + memNLU += nsupers_i*iword; + if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_i + nsupers_j)*iword; + + /* Auxiliary arrays used to set up L, U block data structures. + They are freed on return. + k is the number of local row blocks. 
*/ + if ( !(dense = floatCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) + * sp_ienv_dist(3))) ) { + fprintf(stderr, "Calloc fails for SPA dense[]."); + return (memDist + memNLU + memTRS); + } + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for fmod[]."); + return (memDist + memNLU + memTRS); + } + if ( !(bmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for bmod[]."); + return (memDist + memNLU + memTRS); + } + /* ------------------------------------------------ */ + memNLU += 2*nsupers_i*iword + + SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ + fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){ + fprintf(stderr, "Malloc fails for Unnz[]."); + return (memDist + memNLU + memTRS); + } + memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + memNLU += nsupers_j * sizeof(double*) + nsupers_j 
* sizeof(int_t*)+ nsupers_j * sizeof(int_t*); + Lnzval_bc_ptr[nsupers_j-1] = NULL; + Lrowind_bc_ptr[nsupers_j-1] = NULL; + Linv_bc_ptr[nsupers_j-1] = NULL; + Uinv_bc_ptr[nsupers_j-1] = NULL; + Lindval_loc_bc_ptr[nsupers_j-1] = NULL; + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + len = nsupers_j * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ + memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid); /* Local block number row wise */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + if ( myrow == jbrow ) { /* Block row jb in my process row */ + /* Scatter A into SPA. 
*/ + for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { + for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { + if (i >= asup_rowptr[ilsum[nsupers_i]]) + printf ("ERR7\n"); + jcol = asup_colind[i]; + if (jcol >= n) + printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + gb = BlockNum( jcol ); + lb = LBj( gb, grid ); + if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); + jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); + if (jcol >= ldaspa_j) + printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + dense_col[jcol] = asup_val[i]; + } + dense_col += ldaspa_j; + } + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + /* Count number of blocks and length of each block. */ + nrbu = 0; + len = 0; /* Number of column subscripts I own. */ + len1 = 0; /* number of fstnz subscripts */ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + if (i >= xusub[nsupers_i]) printf ("ERR10\n"); + jcol = usub[i]; + gb = BlockNum( jcol ); /* Global block number */ + + /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) + printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", + iam, jb, gb, jcol, jbcol, pc); */ + + lb = LBj( gb, grid ); /* Local block number */ + pc = PCOL( gb, grid ); /* Process col owning this block */ + if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; + /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ + pr = PROW( gb, grid ); + if ( pr != jbrow && mycol == pc) + bsendx_plist[lb][jbrow] = YES; + if (mycol == pc) { + len += nsupc; + LUb_length[lb] += nsupc; + ToSendD[ljb_i] = YES; + if (Urb_marker[lb] <= jb) { /* First see this block */ + if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; + Urb_marker[lb] = jb + 1; + LUb_number[nrbu] = gb; + /* if (gb == 391825 && jb == 145361) + printf ("Pe[%d] T1 [%d %d] nrbu %d \n", + iam, jb, gb, nrbu); 
*/ + nrbu ++; + len1 += SuperSize( gb ); + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[ljb_i];/* Mod. count for back solve */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + } + } /* for i ... */ + + if ( nrbu ) { + /* Sort the blocks of U in increasing block column index. + SuperLU_DIST assumes this is true */ + /* simple insert sort algorithm */ + /* to be transformed in quick sort */ + for (j = 1; j < nrbu; j++) { + k = LUb_number[j]; + for (i=j-1; i>=0 && LUb_number[i] > k; i--) { + LUb_number[i+1] = LUb_number[i]; + } + LUb_number[i+1] = k; + } + + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) { + fprintf (stderr, "Malloc fails for Uindex[]"); + return (memDist + memNLU + memTRS); + } + Ufstnz_br_ptr[ljb_i] = index; + if (!(Unzval_br_ptr[ljb_i] = + floatMalloc_dist(len))) { + fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); + return (memDist + memNLU + memTRS); + } + memNLU += (len1+1)*iword + len*dword; + uval = Unzval_br_ptr[ljb_i]; + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = nrbu; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index */ + index[len1] = -1; /* End marker */ + next_ind = BR_HEADER; + next_val = 0; + for (k = 0; k < nrbu; k++) { + gb = LUb_number[k]; + lb = LBj( gb, grid ); + len = LUb_length[lb]; + LUb_length[lb] = 0; /* Reset vector of block length */ + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) + index[next_ind] = FstBlockC( jb + 1 ); + LUb_valptr[lb] = next_val; + next_val += len; + } + /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], + and the initial values of A from SPA into Unzval_br_ptr[]. 
*/ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + jcol = usub[i]; + gb = BlockNum( jcol ); + + if ( mycol == PCOL( gb, grid ) ) { + lb = LBj( gb, grid ); + k = LUb_indptr[lb]; /* Start fstnz in index */ + index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); + } + } /* for i ... */ + + for (i = 0; i < nrbu; i++) { + gb = LUb_number[i]; + lb = LBj( gb, grid ); + next_ind = LUb_indptr[lb]; + k = FstBlockC( jb + 1); + jcol = ilsum_j[lb]; + for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { + dense_col = dense; + j = index[next_ind+jj]; + for (ii = j; ii < k; ii++) { + uval[LUb_valptr[lb]++] = dense_col[jcol]; + dense_col[jcol] = zero; + dense_col += ldaspa_j; + } + } + } + } else { + Ufstnz_br_ptr[ljb_i] = NULL; + Unzval_br_ptr[ljb_i] = NULL; + } /* if nrbu ... */ + } /* if myrow == jbrow */ + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + if (mycol == jbcol) { /* Block column jb in my process column */ + /* Scatter A_inf into SPA. */ + for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { + for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { + irow = ainf_rowind[i]; + if (irow >= n) printf ("Pe[%d] ERR1\n", iam); + gb = BlockNum( irow ); + if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); + dense_col[irow] = ainf_val[i]; + } + } + dense_col += ldaspa; + } + + /* sort the indices of the diagonal block at the beginning of xlsub */ + if (myrow == jbrow) { + k = xlsub[ljb_j]; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + if (irow < nsupc + fsupc && i != k+irow-fsupc) { + lsub[i] = lsub[k + irow - fsupc]; + lsub[k + irow - fsupc] = irow; + i --; + } + } + } + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. 
*/ + kseen = 0; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && + myrow == jbrow) { + fsendx_plist[ljb_j][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (Lrb_marker[lb] <= jb) { /* First see this block */ + Lrb_marker[lb] = jb + 1; + LUb_length[lb] = 1; + LUb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else + ++LUb_length[lb]; + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* If I am the owner of the diagonal block, order it first in LUb_number. 
+ Necessary for SuperLU_DIST routines */ + kseen = EMPTY; + for (j = 0; j < nrbl; j++) { + if (LUb_number[j] == jb) + kseen = j; + } + if (kseen != EMPTY && kseen != 0) { + LUb_number[kseen] = LUb_number[0]; + LUb_number[0] = jb; + } + + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) { + fprintf (stderr, "Malloc fails for index[]"); + return (memDist + memNLU + memTRS); + } + Lrowind_bc_ptr[ljb_j] = index; + if (!(Lnzval_bc_ptr[ljb_j] = + floatMalloc_dist(len*nsupc))) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb); + return (memDist + memNLU + memTRS); + } + + if (!(Linv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]"); + if (!(Uinv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]"); + + memNLU += len1*iword + len*nsupc*dword; + + if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3))) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + + lusup = Lnzval_bc_ptr[ljb_j]; + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_ind = BC_HEADER; + next_val = 0; + for (k = 0; k < nrbl; ++k) { + gb = LUb_number[k]; + lb = LBi( gb, grid ); + len = LUb_length[lb]; + + Lindval_loc_bc_ptr[ljb_j][k] = lb; + Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; + Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; + + LUb_length[lb] = 0; + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + LUb_valptr[lb] = next_val; + next_ind += len; + next_val += len; + } + /* Propagate the 
compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = LUb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = LUb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + + + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb_j][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb_j]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jjnprow, grid->npcol); + if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { + fprintf (stderr, "Malloc fails for recvBuf[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + 
return (memDist + memNLU + memTRS); + } + + if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) + memDist = nsupers*k*iword +4*nprocs * sizeof(int); + + for (p = 0; p < nprocs; p++) + nnzToRecv[p] = 0; + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + nnzToRecv[p] += grid->npcol; + } + i = 0; + for (p = 0; p < nprocs; p++) { + ptrToRecv[p] = i; + i += nnzToRecv[p]; + ptrToSend[p] = 0; + if (p != iam) + nnzToSend[p] = nnzToRecv[iam]; + else + nnzToSend[p] = 0; + } + nnzToRecv[iam] = 0; + i = ptrToRecv[iam]; + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + if (p == iam) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + for (j = 0; j < grid->npcol; j++, i++) + recvBuf[i] = ToSendR[ljb_j][j]; + } + } + + MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, + recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid ); /* Local block number row wise */ + /* (myrow == jbrow) { + if (ToSendD[ljb_i] == YES) + ToRecv[jb] = 1; + } + else { + if (recvBuf[ptrToRecv[p] + mycol] == YES) + ToRecv[jb] = 2; + } */ + if (recvBuf[ptrToRecv[p] + mycol] == YES) { + if (myrow == jbrow) + ToRecv[jb] = 1; + else + ToRecv[jb] = 2; + } + if (mycol == jbcol) { + for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) + ToSendR[ljb_j][i] = recvBuf[j]; + ToSendR[ljb_j][mycol] = EMPTY; + } + ptrToRecv[p] += grid->npcol; + } + + /* exchange information about bsendx_plist in between column of processors */ + MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, + MPI_MAX, grid->cscp.comm); + + for (jb = 0; jb < nsupers; jb ++) { + jbcol = PCOL( jb, grid); + jbrow = 
PROW( jb, grid); + if (mycol == jbcol) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + if (myrow == jbrow ) { + for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { + (*bsendx_plist)[k] = recvBuf[k]; + if ((*bsendx_plist)[k] != EMPTY) + nbsendx ++; + } + } + else { + for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) + (*bsendx_plist)[k] = EMPTY; + } + } + } + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. 
*/ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + + // if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + + #if ( PRNTlevel>=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); + + if(ibnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... 
*/ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + /* Free the memory used for storing L and U */ + SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); + if (lsub != NULL) + SUPERLU_FREE(lsub); + if (usub != NULL) + SUPERLU_FREE(usub); + + + SUPERLU_FREE(nnzToRecv); + SUPERLU_FREE(ptrToRecv); + SUPERLU_FREE(nnzToSend); + SUPERLU_FREE(ptrToSend); + SUPERLU_FREE(recvBuf); + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + LUstruct->Glu_persist = Glu_persist; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. 
*/ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist, + ToRecv, ToSendR, ToSendD, mod_bit + */ + CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); +#endif + + return (- (memDist+memNLU)); +} /* sdist_psymbtonum */ + diff --git a/SRC/psutil.c b/SRC/psutil.c new file mode 100644 index 00000000..a3a384a4 --- /dev/null +++ b/SRC/psutil.c @@ -0,0 +1,896 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Several matrix utilities + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ + +#include +#include "superlu_sdefs.h" + +/*! \brief Gather A from the distributed compressed row format to global A in compressed column format. + */ +int psCompRow_loc_to_CompCol_global +( + int_t need_value, /* Input. Whether need to gather numerical values */ + SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ + gridinfo_t *grid, /* Input */ + SuperMatrix *GA /* Output */ +) +{ + NRformat_loc *Astore; + NCformat *GAstore; + float *a, *a_loc; + int_t *colind, *rowptr; + int_t *colptr_loc, *rowind_loc; + int_t m_loc, n, i, j, k, l; + int_t colnnz, fst_row, nnz_loc, nnz; + float *a_recv; /* Buffer to receive the blocks of values. */ + float *a_buf; /* Buffer to merge blocks into block columns. */ + int_t *itemp; + int_t *colptr_send; /* Buffer to redistribute the column pointers of the + local block rows. + Use n_loc+1 pointers for each block. */ + int_t *colptr_blk; /* The column pointers for each block, after + redistribution to the local block columns. + Use n_loc+1 pointers for each block. */ + int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ + int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ + int_t *fst_rows, *n_locs; + int *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32; + int it, n_loc, procs; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psCompRow_loc_to_CompCol_global"); +#endif + + /* Initialization. */ + n = A->ncol; + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */ + + /* ------------------------------------------------------------ + FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN. 
+ ------------------------------------------------------------*/ + sCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc, + &rowind_loc, &colptr_loc); + /* Change local row index numbers to global numbers. */ + for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row; + +#if ( DEBUGlevel>=2 ) + printf("Proc %d\n", grid->iam); + PrintInt10("rowind_loc", nnz_loc, rowind_loc); + PrintInt10("colptr_loc", n+1, colptr_loc); +#endif + + procs = grid->nprow * grid->npcol; + if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) + ABORT("Malloc fails for fst_rows[]"); + n_locs = fst_rows + procs; + MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, + grid->comm); + for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; + n_locs[procs-1] = n - fst_rows[procs-1]; + if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) + ABORT("Malloc fails for recvcnts[]"); + sendcnts = recvcnts + procs; + rdispls = sendcnts + procs; + sdispls = rdispls + procs; + itemp_32 = sdispls + procs; + + /* All-to-all transfer column pointers of each block. + Now the matrix view is P-by-P block-partition. 
*/ + /* n column starts for each column, and procs column ends for each block */ + if ( !(colptr_send = intMalloc_dist(n + procs)) ) + ABORT("Malloc fails for colptr_send[]"); + if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) + ABORT("Malloc fails for colptr_blk[]"); + for (i = 0, j = 0; i < procs; ++i) { + for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; + colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ + sendcnts[i] = n_locs[i] + 1; +#if ( DEBUGlevel>=1 ) + assert(j == fst_rows[i]); +#endif + sdispls[i] = j + i; + recvcnts[i] = n_loc + 1; + rdispls[i] = i * (n_loc + 1); + j += n_locs[i]; /* First column of next block in colptr_loc[] */ + } + MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, + colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); + + /* Adjust colptr_blk[] so that they contain the local indices of the + column pointers in the receive buffer. */ + nnz = 0; /* The running sum of the nonzeros counted by far */ + k = 0; + for (i = 0; i < procs; ++i) { + for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { + colnnz = colptr_blk[j+1] - colptr_blk[j]; + /*assert(k<=j);*/ + colptr_blk[k] = nnz; + nnz += colnnz; /* Start of the next column */ + ++k; + } + colptr_blk[k++] = nnz; /* Add an END marker for each block */ + } + /*assert(k == (n_loc+1)*procs);*/ + + /* Now prepare to transfer row indices and values. */ + sdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; + sdispls[i+1] = sdispls[i] + sendcnts[i]; + } + sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; + for (i = 0; i < procs; ++i) { + j = rdispls[i]; /* Point to this block in colptr_blk[]. */ + recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; + } + rdispls[0] = 0; /* Recompute rdispls[] for row indices. 
*/ + for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; + + k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ + if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + rowind_buf = rowind_recv + k; + MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, + rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(a_recv = (float *) floatMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + a_buf = a_recv + k; + MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_FLOAT, + a_recv, recvcnts, rdispls, MPI_FLOAT, + grid->comm); + } + + /* Reset colptr_loc[] to point to the n_loc global columns. */ + colptr_loc[0] = 0; + itemp = colptr_send; + for (j = 0; j < n_loc; ++j) { + colnnz = 0; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1) + j; /* j-th column in i-th block */ + colnnz += colptr_blk[k+1] - colptr_blk[k]; + } + colptr_loc[j+1] = colptr_loc[j] + colnnz; + itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ + } + itemp[n_loc] = colptr_loc[n_loc]; + + /* Merge blocks of row indices into columns of row indices. */ + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + rowind_buf[itemp[j]] = rowind_recv[l]; + ++itemp[j]; + } + } + } + + if ( need_value ) { + for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + a_buf[itemp[j]] = a_recv[l]; + ++itemp[j]; + } + } + } + } + + /* ------------------------------------------------------------ + SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. 
+ ------------------------------------------------------------*/ + GA->nrow = A->nrow; + GA->ncol = A->ncol; + GA->Stype = SLU_NC; + GA->Dtype = A->Dtype; + GA->Mtype = A->Mtype; + GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); + if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); + + /* First gather the size of each piece. */ + nnz_loc = colptr_loc[n_loc]; + MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); + for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; + GAstore->nnz = nnz; + + if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); + if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); + + /* Allgatherv for row indices. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + itemp[i]; + itemp_32[i] = itemp[i]; + } + itemp_32[procs-1] = itemp[procs-1]; + it = nnz_loc; + MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, + itemp_32, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(GAstore->nzval = (float *) floatMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); + MPI_Allgatherv(a_buf, it, MPI_FLOAT, GAstore->nzval, + itemp_32, rdispls, MPI_FLOAT, grid->comm); + } else GAstore->nzval = NULL; + + /* Now gather the column pointers. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + n_locs[i]; + itemp_32[i] = n_locs[i]; + } + itemp_32[procs-1] = n_locs[procs-1]; + MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, + itemp_32, rdispls, mpi_int_t, grid->comm); + + /* Recompute column pointers. 
*/ + for (i = 1; i < procs; ++i) { + k = rdispls[i]; + for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; + itemp[i] += itemp[i-1]; /* prefix sum */ + } + GAstore->colptr[n] = nnz; + +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("After pdCompRow_loc_to_CompCol_global()\n"); + sPrint_CompCol_Matrix_dist(GA); + } +#endif + + SUPERLU_FREE(a_loc); + SUPERLU_FREE(rowind_loc); + SUPERLU_FREE(colptr_loc); + SUPERLU_FREE(fst_rows); + SUPERLU_FREE(recvcnts); + SUPERLU_FREE(colptr_send); + SUPERLU_FREE(colptr_blk); + SUPERLU_FREE(rowind_recv); + if ( need_value) SUPERLU_FREE(a_recv); +#if ( DEBUGlevel>=1 ) + if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); + CHECK_MALLOC(grid->iam, "Exit psCompRow_loc_to_CompCol_global"); +#endif + return 0; +} /* psCompRow_loc_to_CompCol_global */ + + +/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. + */ +int psPermute_Dense_Matrix +( + int_t fst_row, + int_t m_loc, + int_t row_to_proc[], + int_t perm[], + float X[], int ldx, + float B[], int ldb, + int nrhs, + gridinfo_t *grid +) +{ + int_t i, j, k, l; + int p, procs; + int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psPermute_Dense_Matrix()"); +#endif + + procs = grid->nprow * grid->npcol; + if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sendcnts_nrhs = sendcnts + procs; + recvcnts = sendcnts_nrhs + procs; + recvcnts_nrhs = recvcnts + procs; + sdispls = recvcnts_nrhs + procs; + sdispls_nrhs = sdispls + procs; + rdispls = sdispls_nrhs + procs; + rdispls_nrhs = rdispls + procs; + ptr_to_ibuf = rdispls_nrhs + procs; + ptr_to_dbuf = ptr_to_ibuf + procs; + + for (i = 0; i < procs; ++i) sendcnts[i] = 
0; + + /* Count the number of X entries to be sent to each process.*/ + for (i = fst_row; i < fst_row + m_loc; ++i) { + p = row_to_proc[perm[i]]; + ++sendcnts[p]; + } + MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + sendcnts_nrhs[0] = sendcnts[0] * nrhs; + recvcnts_nrhs[0] = recvcnts[0] * nrhs; + for (i = 1; i < procs; ++i) { + sdispls[i] = sdispls[i-1] + sendcnts[i-1]; + sdispls_nrhs[i] = sdispls[i] * nrhs; + rdispls[i] = rdispls[i-1] + recvcnts[i-1]; + rdispls_nrhs[i] = rdispls[i] * nrhs; + sendcnts_nrhs[i] = sendcnts[i] * nrhs; + recvcnts_nrhs[i] = recvcnts[i] * nrhs; + } + k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ + l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ + /*assert(k == m_loc);*/ + /*assert(l == m_loc);*/ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + + for (i = 0; i < procs; ++i) { + ptr_to_ibuf[i] = sdispls[i]; + ptr_to_dbuf[i] = sdispls_nrhs[i]; + } + + /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ + for (i = fst_row; i < fst_row + m_loc; ++i) { + j = perm[i]; + p = row_to_proc[j]; + send_ibuf[ptr_to_ibuf[p]] = j; + j = ptr_to_dbuf[p]; + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + send_dbuf[j++] = X[i-fst_row + k*ldx]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; + } + + /* Transfer the (permuted) row indices and numerical values. */ + MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, + recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); + MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + + /* Copy the buffer into b. 
*/ + for (i = 0, l = 0; i < m_loc; ++i) { + j = recv_ibuf[i] - fst_row; /* Relative row number */ + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + B[j + k*ldb] = recv_dbuf[l++]; + } + } + + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psPermute_Dense_Matrix()"); +#endif + return 0; +} /* psPermute_Dense_Matrix */ + + +/*! \brief Allocate storage in LUstruct */ +void sLUstructInit(const int_t n, sLUstruct_t *LUstruct) +{ + if ( !(LUstruct->etree = intMalloc_dist(n)) ) + ABORT("Malloc fails for etree[]."); + if ( !(LUstruct->Glu_persist = (Glu_persist_t *) + SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) + ABORT("Malloc fails for Glu_persist_t."); + if ( !(LUstruct->Llu = (sLocalLU_t *) + SUPERLU_MALLOC(sizeof(sLocalLU_t))) ) + ABORT("Malloc fails for LocalLU_t."); + LUstruct->Llu->inv = 0; +} + +/*! \brief Deallocate LUstruct */ +void sLUstructFree(sLUstruct_t *LUstruct) +{ +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter sLUstructFree()"); +#endif + + SUPERLU_FREE(LUstruct->etree); + SUPERLU_FREE(LUstruct->Glu_persist); + SUPERLU_FREE(LUstruct->Llu); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sLUstructFree()"); +#endif +} + +/*! \brief Destroy distributed L & U matrices. 
*/ +void +sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter sDestroy_LU()"); +#endif + + sDestroy_Tree(n, grid, LUstruct); + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Lrowind_bc_ptr[i] ) { + SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); +#if 0 // Sherry: the following is not allocated with cudaHostAlloc + //#ifdef GPU_ACC + checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i])); +#endif + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); + } + SUPERLU_FREE (Llu->Lrowind_bc_ptr); + SUPERLU_FREE (Llu->Lnzval_bc_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i = 0; i < nb; ++i) + if ( Llu->Ufstnz_br_ptr[i] ) { + SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); + SUPERLU_FREE (Llu->Unzval_br_ptr[i]); + } + SUPERLU_FREE (Llu->Ufstnz_br_ptr); + SUPERLU_FREE (Llu->Unzval_br_ptr); + + /* The following can be freed after factorization. */ + SUPERLU_FREE(Llu->ToRecv); + SUPERLU_FREE(Llu->ToSendD); + SUPERLU_FREE(Llu->ToSendR[0]); + SUPERLU_FREE(Llu->ToSendR); + + /* The following can be freed only after iterative refinement. 
*/ + SUPERLU_FREE(Llu->ilsum); + SUPERLU_FREE(Llu->fmod); + SUPERLU_FREE(Llu->fsendx_plist[0]); + SUPERLU_FREE(Llu->fsendx_plist); + SUPERLU_FREE(Llu->bmod); + SUPERLU_FREE(Llu->bsendx_plist[0]); + SUPERLU_FREE(Llu->bsendx_plist); + SUPERLU_FREE(Llu->mod_bit); + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) { + SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]); + } + SUPERLU_FREE(Llu->Lindval_loc_bc_ptr); + + nb = CEILING(nsupers, grid->npcol); + for (i=0; iLinv_bc_ptr[i]!=NULL) { + SUPERLU_FREE(Llu->Linv_bc_ptr[i]); + } + if(Llu->Uinv_bc_ptr[i]!=NULL){ + SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); + } + } + SUPERLU_FREE(Llu->Linv_bc_ptr); + SUPERLU_FREE(Llu->Uinv_bc_ptr); + SUPERLU_FREE(Llu->Unnz); + + nb = CEILING(nsupers, grid->npcol); + for (i = 0; i < nb; ++i) + if ( Llu->Urbs[i] ) { + SUPERLU_FREE(Llu->Ucb_indptr[i]); + SUPERLU_FREE(Llu->Ucb_valptr[i]); + } + SUPERLU_FREE(Llu->Ucb_indptr); + SUPERLU_FREE(Llu->Ucb_valptr); + SUPERLU_FREE(Llu->Urbs); + + SUPERLU_FREE(Glu_persist->xsup); + SUPERLU_FREE(Glu_persist->supno); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sDestroy_LU()"); +#endif +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * 
+ */ +int_t +psgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, sSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. 
*/ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). */ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + 
rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PSGSTRS_INIT */ + + +/*! \brief Initialize the data structure for the solution phase. + */ +int sSolveInit(superlu_dist_options_t *options, SuperMatrix *A, + int_t perm_r[], int_t perm_c[], int_t nrhs, + sLUstruct_t *LUstruct, gridinfo_t *grid, + sSOLVEstruct_t *SOLVEstruct) +{ + int_t *row_to_proc, *inv_perm_c, *itemp; + NRformat_loc *Astore; + int_t i, fst_row, m_loc, p; + int procs; + + Astore = (NRformat_loc *) A->Store; + fst_row = Astore->fst_row; + m_loc = Astore->m_loc; + procs = grid->nprow * grid->npcol; + + if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) + ABORT("Malloc fails for row_to_proc[]"); + SOLVEstruct->row_to_proc = row_to_proc; + if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) + ABORT("Malloc fails for inv_perm_c[]."); + for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; + SOLVEstruct->inv_perm_c = inv_perm_c; + + /* ------------------------------------------------------------ + EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. + SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. + + NOTE: For those processes that do not own any row, it must + must be set so that fst_row == A->nrow. 
+ ------------------------------------------------------------*/ + if ( !(itemp = intMalloc_dist(procs+1)) ) + ABORT("Malloc fails for itemp[]"); + MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, + grid->comm); + itemp[procs] = A->nrow; + for (p = 0; p < procs; ++p) { + for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; + } +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("fst_row = %d\n", fst_row); + PrintInt10("row_to_proc", A->nrow, row_to_proc); + PrintInt10("inv_perm_c", A->ncol, inv_perm_c); + } +#endif + SUPERLU_FREE(itemp); + +#if 0 + /* Compute the mapping between rows and processes. */ + /* XSL NOTE: What happens if # of mapped processes is smaller + than total Procs? For the processes without any row, let + fst_row be EMPTY (-1). Make sure this case works! */ + MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, + grid->comm); + itemp[procs] = n; + for (p = 0; p < procs; ++p) { + j = itemp[p]; + if ( j != EMPTY ) { + k = itemp[p+1]; + if ( k == EMPTY ) k = n; + for (i = j ; i < k; ++i) row_to_proc[i] = p; + } + } +#endif + + get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, + &SOLVEstruct->num_diag_procs, + &SOLVEstruct->diag_procs, + &SOLVEstruct->diag_len); + + /* Setup communication pattern for redistribution of B and X. */ + if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + ABORT("Malloc fails for gstrs_comm[]"); + psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct); + + if ( !(SOLVEstruct->gsmv_comm = (psgsmv_comm_t *) + SUPERLU_MALLOC(sizeof(psgsmv_comm_t))) ) + ABORT("Malloc fails for gsmv_comm[]"); + SOLVEstruct->A_colind_gsmv = NULL; + + options->SolveInitialized = YES; + return 0; +} /* sSolveInit */ + +/*! \brief Release the resources used for the solution phase. 
+ */ +void sSolveFinalize(superlu_dist_options_t *options, sSOLVEstruct_t *SOLVEstruct) +{ + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + psgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; + } +} /* sSolveFinalize */ + +void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* sDestroy_A3d_gathered_on_2d */ + + +/*! 
\brief Check the inf-norm of the error vector + */ +void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx, + float xtrue[], int_t ldxtrue, MPI_Comm slucomm) +{ + float err, xnorm, temperr, tempxnorm; + float *x_work, *xtrue_work; + int i, j; + + for (j = 0; j < nrhs; j++) { + x_work = &x[j*ldx]; + xtrue_work = &xtrue[j*ldxtrue]; + err = xnorm = 0.0; + for (i = 0; i < n; i++) { + err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); + xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); + } + + /* get the golbal max err & xnrom */ + temperr = err; + tempxnorm = xnorm; + MPI_Allreduce( &temperr, &err, 1, MPI_FLOAT, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_FLOAT, MPI_MAX, slucomm); + + err = err / xnorm; + if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + } +} + +/*! \brief Destroy broadcast and reduction trees used in triangular solve */ +void +sDestroy_Tree(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter Destroy_Tree()"); +#endif + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); + } + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); + } + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sDestroy_Tree()"); +#endif +} + + diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c index 597c364a..fe2b722c 100644 
--- a/SRC/psymbfact.c +++ b/SRC/psymbfact.c @@ -353,8 +353,8 @@ float symbfact_dist #endif /* Allocate storage common to the symbolic factor routines */ - if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, - &Llu_symbfact, &VInfo, &CS, &PS)) + if ((iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, + &Llu_symbfact, &VInfo, &CS, &PS))) return (PS.allocMem); /* Copy the redistributed input matrix AS at the end of the memory buffer allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in @@ -2067,15 +2067,15 @@ symbfact_vtx /* TEST available memory */ if (next >= x_aind_end) { if (domain_symb) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - } else if (mem_error = - psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, - computeL, LL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, 0, + computeL, DOMAIN_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); + } else if ( (mem_error = + psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, + computeL, LL_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); x_aind_end = xsub[vtx_lid + 1]; @@ -2246,9 +2246,9 @@ updateRcvd_prGraph /* test if enough memory in usubPr array */ if (ind >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) ) + return (mem_error); if (computeL) subPr = Llu_symbfact->lsubPr; else @@ -2367,9 +2367,9 @@ update_prGraph if (sn_elt < lstVtx_blk) { sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset; if ((*p_indsubPr) + 2 >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) ) + return 
(mem_error); if (computeL) { subPr = Llu_symbfact->lsubPr; szsubPr = Llu_symbfact->szLsubPr; } @@ -2523,16 +2523,16 @@ blk_symbfact prval_curvtx = n; /* Compute nonzero structure L(:,vtx) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextl, - marker, - lsub_rcvd, lsub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, - &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, - &neltsMatched_L, markl1_vtx, &prval_curvtx, - vtx_bel_snU, &vtx_bel_snL)) + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextl, + marker, + lsub_rcvd, lsub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, + &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, + &neltsMatched_L, markl1_vtx, &prval_curvtx, + vtx_bel_snU, &vtx_bel_snL)) ) return (mem_error); lsub = Llu_symbfact->lsub; @@ -2541,17 +2541,17 @@ blk_symbfact #endif /* Compute nonzero structure of U(vtx,:) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextu, - marker, - usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, - &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, - &neltsMatched_U, marku1_vtx, &prval_curvtx, - vtx_bel_snL, &vtx_bel_snU)) - return (mem_error); + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextu, + marker, + usub_rcvd, usub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, + &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, + &neltsMatched_U, marku1_vtx, &prval_curvtx, + vtx_bel_snL, &vtx_bel_snU)) ) + return (mem_error); usub = Llu_symbfact->usub; #ifdef TEST_SYMB @@ -2618,11 +2618,11 @@ blk_symbfact *p_nextu = xusub[vtx_lid]; nsuper_loc += 1; *p_nsuper_loc = nsuper_loc; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, 
ind_sizes1, ind_sizes2, - sizes, fstVtxSep, vtx, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, vtx, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); /* set up neltsZr and neltsTotal */ vtx = lstVtx_blk; @@ -3157,7 +3157,8 @@ expand_RL if (!computeL) marker[vtx] = markl; - for (ii; ii < mpnelts; ii++) { + //for (ii; ii < mpnelts; ii++) { // Sherry: compiler warning + for (; ii < mpnelts; ii++) { elt = lsub_rcvd[ii]; if (elt >= vtx) { if (marker[elt] != markl) { @@ -3201,10 +3202,10 @@ expand_RL } nextl = xlsub[vtxXp_lid+1]; - if (mem_error = - psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, - computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, + computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); return 0; } @@ -3332,9 +3333,9 @@ rl_update /* test if enough memory in usubPr array */ if (ind > Llu_symbfact->szLsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) ) + return (mem_error); usubPr = Llu_symbfact->lsubPr; } @@ -3456,19 +3457,20 @@ rl_update if (!computeL) marker[vtx] = markl; PS->nops += mpnelts - ii; - for (ii; ii < mpnelts; ii++) { + //for (ii; ii < mpnelts; ii++) { // Sherry: compiler warning + for (; ii < mpnelts; ii++) { elt = lsub_rcvd[ii]; if (elt >= vtx) { if (marker[elt] != markl) { /* add elt to structure of vtx */ if (nextl >= xlsub[vtx_lid + 1]) { - if (mem_error = - expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, vtx, i, - lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, - fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, - &markl, marker, Pslu_freeable, Llu_symbfact, 
VInfo, PS)) - return (mem_error); + if ( (mem_error = + expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, + usub_rcvd, usub_rcvd_sz, vtx, i, + lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, + fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, + &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); if (computeL) { lsub = Llu_symbfact->lsub; if (!computeRcvd) @@ -3565,20 +3567,20 @@ dnsUpSeps_symbfact else vtx_elt = fstVtx_lvl; if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, - nextl + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, + nextl + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, - nextu + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, + nextu + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3610,10 +3612,10 @@ dnsUpSeps_symbfact if (lsub[k] >= fstVtx_blk) { lsub[nextl] = lsub[k]; nextl ++; if (nextl >= MEM_LSUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } @@ -3621,10 +3623,10 @@ dnsUpSeps_symbfact if (usub[k] > fstVtx_blk) { usub[nextu] = usub[k]; nextu ++; if (nextu >= MEM_USUB( Llu_symbfact, VInfo )) - if (mem_error 
= - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, - USUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, + USUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3892,10 +3894,10 @@ dnsCurSep_symbfact j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, next + j, + computeL, DNS_CURSEP, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); if (computeL) sub = Llu_symbfact->lsub; else sub = Llu_symbfact->usub; @@ -4131,19 +4133,19 @@ denseSep_symbfact } if (VInfo->filledSep == FILLED_SEP) { - if (mem_error = - dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, lstP - fstP, rcvd_dnsSep, p_nextl, - p_nextu, p_mark, p_nsuper_loc, marker, ndCom, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) + if ( (mem_error = + dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, + szSep, lstP - fstP, rcvd_dnsSep, p_nextl, + p_nextu, p_mark, p_nsuper_loc, marker, ndCom, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) ) return (mem_error); } else if (rcvd_dnsSep) - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, EMPTY, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, EMPTY, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); return 0; } @@ -4251,11 +4253,11 @@ interLvl_symbfact /* quick return if all upper separators are dense */ if 
(VInfo->filledSep != FILLED_SEPS) { VInfo->filledSep = FILLED_SEPS; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, - fstVtxSep, - EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, + fstVtxSep, + EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); } return 0; @@ -4774,7 +4776,11 @@ intraLvl_symbfact MPI_Irecv (&sz_msg, 1, mpi_int_t, MPI_ANY_SOURCE, tag_intraLvl_szMsg, (*symb_comm), &(request[0])); +#if defined (_LONGINT) if (sz_msg > LONG_MAX) +#else + if (sz_msg > INT_MAX) +#endif ABORT("ERROR in intraLvl_symbfact size to send > LONG_MAX\n"); } MPI_Waitany (2, request, index_req, status); diff --git a/SRC/psymbfact_util.c b/SRC/psymbfact_util.c index 40b9c864..39c066ae 100644 --- a/SRC/psymbfact_util.c +++ b/SRC/psymbfact_util.c @@ -292,10 +292,10 @@ int_t psymbfact_LUXpand if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; @@ -437,10 +437,10 @@ int_t psymbfact_LUXpand_RL if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) 
return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c new file mode 100644 index 00000000..a3bbe4d9 --- /dev/null +++ b/SRC/pz3dcomm.c @@ -0,0 +1,875 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Communication routines for the 3D algorithm. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_zdefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(doublecomplex), MPI_INFO_NULL, &(a)))
+
+int_t zAllocLlu(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    /* Allocate the per-process L/U panel pointer arrays and the
+     * send/receive bookkeeping arrays of the 2D local LU structure,
+     * initializing every panel pointer to NULL and every flag to 0. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    int_t nbc = CEILING(nsupers, Pc);  /* # local block columns */
+    int_t nbr = CEILING(nsupers, Pr);  /* # local block rows */
+
+    /* L panels: one (index, value) pointer pair per local block column. */
+    int_t **Lrowind_bc_ptr =
+        (int_t**) SUPERLU_MALLOC(nbc * sizeof(int_t*));
+    doublecomplex **Lnzval_bc_ptr =
+        (doublecomplex**) SUPERLU_MALLOC(nbc * sizeof(doublecomplex*));
+    for (int i = 0; i < nbc; ++i) {
+        Lrowind_bc_ptr[i] = NULL;
+        Lnzval_bc_ptr[i] = NULL;
+    }
+
+    /* U panels: one (index, value) pointer pair per local block row. */
+    int_t **Ufstnz_br_ptr =
+        (int_t**) SUPERLU_MALLOC(nbr * sizeof(int_t*));
+    doublecomplex **Unzval_br_ptr =
+        (doublecomplex**) SUPERLU_MALLOC(nbr * sizeof(doublecomplex*));
+    for (int i = 0; i < nbr; ++i) {
+        Ufstnz_br_ptr[i] = NULL;
+        Unzval_br_ptr[i] = NULL;
+    }
+
+    /* Communication bookkeeping; plain int type (Sherry's note).
+     * ToRecv[k]: recv from no one (0), left (1), or up (2). */
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (int i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+    /* ToSendD[lb]: whether the block row must be sent down. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (int i = 0; i < nbr; ++i) ToSendD[i] = 0;
+
+    /* ToSendR[ljb][p]: processes to send the block column right to.
+     * Rows are allocated here but intentionally left uninitialized,
+     * matching the original behavior (filled in later by the setup). */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+    for (int i = 0; i < nbc; ++i)
+        ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+
+    /* Publish everything into the local LU structure. */
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+    Llu->Lnzval_bc_ptr  = Lnzval_bc_ptr;
+    Llu->Ufstnz_br_ptr  = Ufstnz_br_ptr;
+    Llu->Unzval_br_ptr  = Unzval_br_ptr;
+    Llu->ToRecv  = ToRecv;
+    Llu->ToSendD = ToSendD;
+    Llu->ToSendR = ToSendR;
+
+    return 0;
+} /* zAllocLlu */
+
+/* Re-allocate layer-0's U and L panel storage with MPI_Alloc_mem
+ * (via the MPI_INT_ALLOC / MPI_DATATYPE_ALLOC macros), copying the
+ * existing contents and freeing the original SUPERLU_MALLOC'd arrays.
+ * Presumably intended for MPI_MALLOC-based communication paths —
+ * only reachable when MPI_MALLOC is defined (TODO confirm). */
+int_t zmpiMallocLUStruct(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    /* Pass 1: U panels, one per local block row.  For each non-empty
+     * panel, copy (usub, uval) into MPI-allocated memory and swap the
+     * pointers in place. */
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    doublecomplex * uval = Unzval_br_ptr[lb];
+	    doublecomplex * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    /* usub[1] = # of nzval entries, usub[2] = length of usub
+		     * itself, per the U-panel header layout. */
+		    lenv = usub[1];
+		    lens = usub[2];
+		    
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(doublecomplex));
+		    /* Swap in the MPI-allocated copies, release the originals. */
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /* Pass 2: L panels, one per block column owned by this process
+     * column.  Same copy-and-swap as above. */
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    doublecomplex *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    /* len1 = total length of lsub; len2 = # of nzval
+			     * entries (supernode width * LDA). */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(doublecomplex));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
+
+
+int_t zzSendLPanel(int_t k, int_t receiver,
+                   zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Send the numerical values of local L panel k to process `receiver`
+     * along the z-dimension of the 3D grid.  No-op unless this process
+     * owns block column k and the panel is non-empty.  Accumulates the
+     * message volume into SCT->commVolRed. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL(iam, grid);
+
+    if (mycol == PCOL(k, grid)) {
+        int_t lk = LBj(k, grid);             /* local block-column number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[lk];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL) {
+            int_t len  = lsub[1];            /* LDA of the nzval[] */
+            int_t len2 = SuperSize(k) * len; /* # entries in the L panel */
+
+            MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k,
+                     grid3d->zscp.comm);
+            SCT->commVolRed += len2 * sizeof(doublecomplex);
+        }
+    }
+    return 0;
+}
+
+
+int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
+                    doublecomplex* Lval_buf,
+                    zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Receive L panel k from `sender` over the z-communicator into
+     * Lval_buf and fold it into the local copy:
+     *     A(k) = alpha*A(k) + beta*A^{sender}(k)
+     * No-op unless this process owns block column k and the panel is
+     * non-empty. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL(iam, grid);
+
+    if (mycol == PCOL(k, grid)) {
+        int_t lk = LBj(k, grid);           /* local block-column number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[lk];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL) {
+            int len  = lsub[1];            /* LDA of the nzval[] */
+            int len2 = SuperSize(k) * len; /* # entries in the L panel */
+            MPI_Status status;
+
+            MPI_Recv(Lval_buf, len2, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* lnzval = alpha*lnzval + beta*Lval_buf */
+            superlu_zscal(len2, alpha, lnzval, 1);
+            superlu_zaxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+        }
+    }
+
+    return 0;
+}
+
+int_t zzSendUPanel(int_t k, int_t receiver,
+                    zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Send the numerical values of local U panel k to process `receiver`
+     * along the z-dimension.  No-op unless this process owns block row k
+     * and the panel is non-empty.  Accumulates message volume into
+     * SCT->commVolRed. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+
+    if (myrow == PROW(k, grid)) {
+        int_t lk = LBi(k, grid);    /* local block-row number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lk];
+
+        if (usub != NULL) {
+            int lenv = usub[1];     /* # nzval entries in this U panel */
+
+            MPI_Send(unzval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k,
+                     grid3d->zscp.comm);
+            SCT->commVolRed += lenv * sizeof(doublecomplex);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
+                    doublecomplex* Uval_buf, zLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* Receive U panel k from `sender` over the z-communicator into
+     * Uval_buf and fold it into the local copy:
+     *     U(k) = alpha*U(k) + beta*U^{sender}(k)
+     * No-op unless this process owns block row k and the panel is
+     * non-empty. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+
+    if (myrow == PROW(k, grid)) {
+        int_t lk = LBi(k, grid);    /* local block-row number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lk];
+
+        if (usub != NULL) {
+            int lenv = usub[1];     /* # nzval entries in this U panel */
+            MPI_Status status;
+
+            MPI_Recv(Uval_buf, lenv, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* unzval = alpha*unzval + beta*Uval_buf */
+            superlu_zscal(lenv, alpha, unzval, 1);
+            superlu_zaxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+        }
+    }
+    return 0;
+}
+
+
+int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    /* Broadcast the supernode partition (Glu_persist), the L/U index and
+     * value panels, and the communication bookkeeping arrays from z-layer
+     * 0 to every other layer of the 3D process grid.  Layers != 0
+     * allocate the receiving structures first.  Collective over
+     * grid3d->zscp.comm: all layers must call with the same n. */
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    /* Only layer 0 knows the supernode count before the Bcast below. */
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	zAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist: supernode boundaries and membership */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
+	zAllocLlu(nsupers, LUstruct, grid3d);
+    
+    zLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    zscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    zscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    /* communication buffer sizes */
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Bookkeeping arrays ToRecv / ToSendD / ToSendR (plain int type). */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+#ifdef MPI_MALLOC
+    /* BUG FIX: was 'mpiMallocLUStruct', a name that does not exist in this
+     * z-precision file; the routine defined above is zmpiMallocLUStruct.
+     * The old call would fail to link whenever MPI_MALLOC was enabled. */
+    if (!grid3d->zscp.Iam)
+	zmpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* zp3dScatter */
+
+
+/* Broadcast (scatter) every local U panel from z-layer 0 to all other
+ * z-layers.  Layers != 0 allocate the receiving usub/uval arrays; after
+ * this call every layer holds an identical copy of the U panels it
+ * co-owns.  Collective over grid3d->zscp.comm — the per-panel Bcast
+ * sequence (flag, lens, lenv, usub, uval) must stay in this exact order
+ * on all layers. */
+int_t zscatter3dUPanels(int_t nsupers,
+		       zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	doublecomplex * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    /* usub[1] = # nzval entries, usub[2] = length of usub itself;
+	     * only layer 0 can read them before the Bcasts below. */
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];
+		    lens = usub[2];
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    /* NOTE(review): the 'if' below guards only the single allocation
+	     * statement inside the #ifdef/#else; the following MPI_Bcast
+	     * executes on every layer, as a collective must. */
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = doublecomplexMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end zScatter3dUPanels */
+
+
+/* Broadcast (scatter) every local L panel from z-layer 0 to all other
+ * z-layers, per block column owned by this process column.  Layers != 0
+ * allocate the receiving lsub/lnzval arrays.  Collective over
+ * grid3d->zscp.comm — the per-panel Bcast sequence (flag, len1, lsub,
+ * len2, lnzval) must stay in this exact order on all layers. */
+int_t zscatter3dLPanels(int_t nsupers,
+                       zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    doublecomplex* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		int_t nrbl, len, len1, len2;
+		/* Only layer 0 can read the panel header; receivers get
+		 * len1/len2 from the Bcasts below (nrbl/len stay unset on
+		 * them but are not read). */
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+   	        /*allocate lsub*/
+		/* NOTE(review): each 'if (grid3d->zscp.Iam)' below guards only
+		 * the single allocation statement inside its #ifdef/#else; the
+		 * MPI_Bcast calls that follow run on every layer despite the
+		 * extra indentation — which is what a collective requires. */
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    
+		    lsub = INT_T_ALLOC(len1);
+#endif
+		    /*now broadcast lsub*/
+		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		    /*set up pointer*/
+		    Lrowind_bc_ptr[ljb] = lsub;
+		    
+		    /*bcast lnzval len*/
+		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		    /*allocate space for nzval*/
+		    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+			MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		        lnzval = doublecomplexCalloc_dist(len2);
+#endif
+		    
+		    /*bcast nonzero values*/
+		    MPI_Bcast( lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0,  grid3d->zscp.comm);
+		    
+		    /*setup the pointers*/
+		    Lnzval_bc_ptr[ljb] = lnzval;
+
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* zscatter3dLPanels */
+
+int_t zcollect3dLpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+    /* Collect the numerical values of every local L panel from z-layer
+     * `layer` into the corresponding panels on z-layer 0.  The index
+     * structures (Lrowind_bc_ptr) are identical across layers, so only
+     * nzval data moves.  Must be called by both layers involved. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub = Lrowind_bc_ptr[ljb];
+	    doublecomplex* lnzval = Lnzval_bc_ptr[ljb];
+
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];             /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /* size of nzval of L panel */
+
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    /* BUG FIX: the receive previously used MPI_DOUBLE, which
+		     * mismatches the doublecomplex send above (half the data
+		     * per element) and corrupts the collected panel.  Receive
+		     * with the same datatype and count as the matching send. */
+		    MPI_Recv(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    /* Collect the numerical values of every local U panel from z-layer
+     * `layer` into the matching panels on z-layer 0.  Index structures
+     * are identical across layers, so only nzval data moves. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    gridinfo_t *grid = &(grid3d->grid2d);
+
+    int_t nlbr = CEILING(nsupers, grid->nprow);  /* # local block rows */
+    for (int_t lb = 0; lb < nlbr; ++lb)
+    {
+        int_t *usub = Llu->Ufstnz_br_ptr[lb];
+        doublecomplex *uval = Llu->Unzval_br_ptr[lb];
+
+        if (usub)
+        {
+            int lenv = usub[1];   /* # nzval entries in this U panel */
+
+            if (grid3d->zscp.Iam == layer)
+            {
+                MPI_Send(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0, lb,
+                         grid3d->zscp.comm);
+            }
+            if (!grid3d->zscp.Iam)
+            {
+                MPI_Status status;
+                MPI_Recv(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, layer, lb,
+                         grid3d->zscp.comm, &status);
+            }
+        }
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    /* Pull both the L and the U panels of z-layer `layer` back onto
+     * z-layer 0; collective for the two layers involved. */
+    int_t nSupers = getNsupers(n, LUstruct->Glu_persist);
+
+    zcollect3dLpanels(layer, nSupers, LUstruct, grid3d);
+    zcollect3dUpanels(layer, nSupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+int_t zzeroSetLU(int_t nnodes, int_t* nodeList, zLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    /* For each supernode in nodeList[0..nnodes), zero the numerical
+     * values of the local L panel (if this process owns the block
+     * column) and of the local U panel (if it owns the block row).
+     * The index structures are left untouched. */
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;  /* used by SuperSize() */
+    gridinfo_t *grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t myrow = MYROW(iam, grid);
+    int_t mycol = MYCOL(iam, grid);
+
+    /* Pass 1: zero the L panels. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+        int_t jb = nodeList[node];
+        if (mycol != PCOL(jb, grid)) continue;  /* not our block column */
+
+        int_t ljb = LBj(jb, grid);              /* local block number */
+        int_t *lsub = Llu->Lrowind_bc_ptr[ljb];
+        doublecomplex *lnzval = Llu->Lnzval_bc_ptr[ljb];
+
+        if (lsub != NULL)
+        {
+            int_t len  = lsub[1];               /* LDA of the nzval[] */
+            int_t len2 = SuperSize(jb) * len;   /* size of the L panel */
+            memset(lnzval, 0, len2 * sizeof(doublecomplex));
+        }
+    }
+
+    /* Pass 2: zero the U panels. */
+    for (int_t node = 0; node < nnodes; ++node)
+    {
+        int_t ib = nodeList[node];
+        if (myrow != PROW(ib, grid)) continue;  /* not our block row */
+
+        int_t lib = LBi(ib, grid);              /* local block number */
+        int_t *usub = Llu->Ufstnz_br_ptr[lib];
+        doublecomplex *unzval = Llu->Unzval_br_ptr[lib];
+
+        if (usub != NULL)
+        {
+            int lenv = usub[1];                 /* # nzval entries */
+            memset(unzval, 0, lenv * sizeof(doublecomplex));
+        }
+    }
+
+    return 0;
+}
+
+
+/* Reduce (sum) the L and U panels of the listed ancestor nodes from the
+ * sender grid onto the receiver grid along the z-dimension. */
+int_t zreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        doublecomplex* Lval_buf, doublecomplex* Uval_buf,
+                        zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* alpha = beta = 1: presumably the receiver accumulates the incoming
+       panel into its own copy (contrast zgatherFactoredLU's alpha = 0);
+       confirm against zzRecvLPanel/zzRecvUPanel semantics. */
+    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t nd = 0; nd < nnodes; ++nd)
+    {
+        int_t gb = nodeList[nd];   /* global block number */
+
+        if (myGrid == sender)
+        {
+            zzSendLPanel(gb, receiver, LUstruct, grid3d, SCT);
+            zzSendUPanel(gb, receiver, LUstruct, grid3d, SCT);
+        }
+        else
+        {
+            zzRecvLPanel(gb, sender, alpha, beta, Lval_buf, LUstruct, grid3d, SCT);
+            zzRecvUPanel(gb, sender, alpha, beta, Uval_buf, LUstruct, grid3d, SCT);
+        }
+    }
+    return 0;
+}
+
+
+/* Ship the factored L and U panels of the listed nodes from the sender
+ * grid to the receiver grid. */
+int_t zgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        zLUValSubBuf_t* LUvsb,
+                        zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    /* alpha = 0, beta = 1: presumably the receiver overwrites its panel
+       with the incoming data rather than accumulating (contrast
+       zreduceAncestors3d's alpha = 1); confirm against zzRecvLPanel. */
+    doublecomplex alpha = {0.0, 0.0}, beta = {1.0, 0.0};
+    doublecomplex *Lval_buf = LUvsb->Lval_buf;
+    doublecomplex *Uval_buf = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    for (int_t nd = 0; nd < nnodes; ++nd)
+    {
+        int_t gb = nodeList[nd];   /* global block number */
+
+        if (myGrid == sender)
+        {
+            zzSendLPanel(gb, receiver, LUstruct, grid3d, SCT);
+            zzSendUPanel(gb, receiver, LUstruct, grid3d, SCT);
+        }
+        else
+        {
+            zzRecvLPanel(gb, sender, alpha, beta, Lval_buf, LUstruct, grid3d, SCT);
+            zzRecvUPanel(gb, sender, alpha, beta, Uval_buf, LUstruct, grid3d, SCT);
+        }
+    }
+    return 0;
+}
+
+
+/* For every hierarchy level flagged in myZeroTrIdxs[], zero out this
+ * process's local LU entries belonging to that level's tree. */
+int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, zLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+    {
+        if (!myZeroTrIdxs[lvl]) continue;  /* level not flagged for zeroing */
+
+        int_t tree = myTreeIdxs[lvl];
+        zzeroSetLU(nodeCount[tree], nodeList[tree], LUstruct, grid3d);
+    }
+
+    return 0;
+}
+
+
+/* Reduce the panels of every ancestor level above ilvl onto the paired
+ * receiver grid.  Grids 2^ilvl apart in the z-dimension form pairs:
+ * the one whose index is a multiple of 2^(ilvl+1) receives. */
+int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    doublecomplex *Lval_buf = LUvsb->Lval_buf;
+    doublecomplex *Uval_buf = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+
+    /* Work out which side of the pair this z-grid is on. */
+    int_t sender, receiver;
+    if (myGrid % (1 << (ilvl + 1)))
+    {
+        sender   = myGrid;
+        receiver = myGrid - (1 << ilvl);
+    }
+    else
+    {
+        receiver = myGrid;
+        sender   = myGrid + (1 << ilvl);
+    }
+
+    /* Reduce all the ancestors, timing each level's reduction. */
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+    {
+        double t0 = SuperLU_timer_();
+        zreduceAncestors3d(sender, receiver, myNodeCount[alvl], treePerm[alvl],
+                           Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+        SCT->ancsReduce += SuperLU_timer_() - t0;
+    }
+    return 0;
+}
+
+/* Gather the factored L and U panels from all z-grid layers onto
+ * layer 0.  Combining is pairwise: at step ilvl, z-grids 2^ilvl apart
+ * exchange panels, so after maxLvl-1 steps layer 0 holds the complete
+ * factorization. */
+int_t zgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    zLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* Skip levels where this process's tree was zeroed out
+	       (it has no panels to contribute). */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    /* Pair z-grids 2^ilvl apart: the grid whose index is a
+		       multiple of 2^(ilvl+1) receives; its partner sends. */
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    /* Forward every tree from level 0 up to ilvl that the
+		       sender is responsible for. */
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;  /* trees at this level owned by sender */
+			    int_t blvl = maxLvl - alvl - 1;
+			    /* (1<<blvl)-1 is the first tree index of level blvl,
+			       consistent with heap-order numbering of the tree
+			       forest; (sender >> alvl) selects the sender's
+			       subtree within that level. */
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    zgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* zgatherAllFactoredLU */
+
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index 07404e0d..bd0f7c7c 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -12,9 +12,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
*/ @@ -72,9 +73,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ia, *ja, **ia_send, *index, *itemp = NULL; int_t *ptr_to_send; - doublecomplex *aij, **aij_send, *nzval, *dtemp; + doublecomplex *aij, **aij_send, *nzval, *dtemp = NULL; doublecomplex *nzval_a; doublecomplex asum,asum_tot; int iam, it, p, procs, iam_g; @@ -140,8 +141,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, ABORT("Malloc fails for ia[]."); if ( !(aij = doublecomplexMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); + ja = ia + k; } - ja = ia + k; /* Allocate temporary storage for sending/receiving the A triplets. */ if ( procs > 1 ) { @@ -169,9 +170,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { - ia_send[p] = &index[i]; + if (nnzToSend[p] > 0) ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; + if (nnzToSend[p] > 0) aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } @@ -215,7 +216,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, NOTE: Can possibly use MPI_Alltoallv. 
------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); @@ -226,7 +228,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToRecv[p] > 0 ) { + //if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; @@ -245,7 +248,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct, } for (p = 0; p < procs; ++p) { - if ( p != iam ) { + if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang + //if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } diff --git a/SRC/pzgsmv.c b/SRC/pzgsmv.c index 0c0838ba..f76c3293 100644 --- a/SRC/pzgsmv.c +++ b/SRC/pzgsmv.c @@ -375,11 +375,11 @@ void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm) int_t *it; doublecomplex *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c index 722fc9a9..5decae78 100644 --- a/SRC/pzgssvx.c +++ b/SRC/pzgssvx.c @@ -323,7 +323,7 @@ at the top-level directory. * = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows * of the original matrix to make the diagonal large * relative to the off-diagonal. 
- * = LargeDiag_APWM: use the parallel approximate-weight perfect + * = LargeDiag_HPWM: use the parallel approximate-weight perfect * matching to permute rows of the original matrix * to make the diagonal large relative to the * off-diagonal. @@ -405,7 +405,7 @@ at the top-level directory. * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * - * o R (double*) dimension (A->nrow) + * o R (double *) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). @@ -413,7 +413,7 @@ at the top-level directory. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * - * o C (double*) dimension (A->ncol) + * o C (double *) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). @@ -489,7 +489,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -587,13 +587,13 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Test the input parameters. 
*/ *info = 0; Fact = options->Fact; - if ( Fact < 0 || Fact > FACTORED ) + if ( Fact < DOFACT || Fact > FACTORED ) *info = -1; - else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR ) *info = -1; - else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC ) *info = -1; - else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA ) *info = -1; else if ( options->IterRefine == SLU_EXTRA ) { *info = -1; @@ -664,6 +664,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -913,7 +914,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif - } else { /* use largeDiag_AWPM */ + } else { /* use LargeDiag_HWPM */ #ifdef HAVE_COMBBLAS z_c2cpp_GetHWPM(A, grid, ScalePermstruct); #else @@ -1060,7 +1061,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1082,10 +1083,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1229,11 +1230,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. 
Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz "IFMT " nnzLU %ld sum %e lsum %e N "IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum.r+asum.i, lsum.r+lsum.i, nnz_tot,nnzLU,asum_tot.r+asum_tot.i,lsum_tot.r+lsum_tot.i,A->ncol); fflush(stdout); #endif @@ -1324,7 +1320,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ @@ -1572,6 +1569,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c new file mode 100644 index 00000000..df80cf00 --- /dev/null +++ b/SRC/pzgssvx3d.c @@ -0,0 +1,1590 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
+ */
+#include "superlu_zdefs.h"
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PZGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the row permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the columns scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinment is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PZGSTRF can factorize rectangular matrices.
+ *
+ *	   Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above condition are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) zScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) doublecomplex* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) zLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (zLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'zLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) zSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_zdefs.h for the definition of 'zSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +void +pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + zScalePermstruct_t * ScalePermstruct, + doublecomplex B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + zLUstruct_t * LUstruct, zSOLVEstruct_t * SOLVEstruct, + double *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore = A->Store; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + doublecomplex *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + doublecomplex *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + doublecomplex *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + LUstruct->dt = 'z'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_Z || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("pzgssvx3d", grid, -(*info)); + return; + } + + /* Initialization. */ + + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. 
+ B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + zGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (doublecomplex *) A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + //PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; +////// + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter pzgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on 2D grid-0 + ldb = Astore0->m_loc; + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (doublecomplex *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == 
YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else { + rowequ = colequ = FALSE; + } + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. 
*/ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */ + zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + pzgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pzlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... 
LAPACK style, not involving MC64 */ + + if ( !factored ) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pzCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (doublecomplex *) GAstore->nzval; + GA_mem_use += nnz * sizeof (doublecomplex); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = zldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + zd_mult(&a[i], &a[i], R1[irow]); + zd_mult(&a[i], &a[i], C1[icol]); +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i])); + else if (job == 4) + dsum += slud_z_abs1(&a[i]); + else if (job == 5) + dprod *= slud_z_abs1(&a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + z_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pzlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if ( !factored ) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: pzgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* 
end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. 
+ NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pzdistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = zdist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + zp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = zinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pzgstrf3d (options, 
m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + zgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + z3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + zDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + zQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 
1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ( (nrhs > 0) && (*info == 0) ) { + if (!(b_work = doublecomplexMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], R[irow]); + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], C[irow]); + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doublecomplexMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pzgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + zSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pzgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + zSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pzgsmv_finalize (SOLVEstruct->gsmv_comm); + pzgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + doublecomplex at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (zSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(zSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pzgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pzgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + pzPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], C[irow]); + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], R[irow]); + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if ( nrhs > 0 ) zScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit pzgssvx3d()"); +#endif + +} diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c index f8819583..29396604 100644 --- a/SRC/pzgssvx_ABglobal.c +++ b/SRC/pzgssvx_ABglobal.c @@ -587,6 +587,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -875,7 +876,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -898,7 +899,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1097,6 +1098,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c index 174ce9ed..a9bf5dd1 100644 --- a/SRC/pzgstrf.c +++ b/SRC/pzgstrf.c @@ -108,7 +108,6 @@ at the top-level directory. */ #include -/*#include "mkl.h"*/ #include "superlu_zdefs.h" #ifdef GPU_ACC @@ -127,7 +126,7 @@ at the top-level directory. 
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -766,11 +765,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int gpublas_nb = get_gpublas_nb(); // default 64 - int nstreams = get_num_gpu_streams (); + int nstreams = get_num_gpu_streams (); // default 8 - int buffer_size = SUPERLU_MAX(max_row_size*nstreams*gpublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); /* array holding last column blk for each partition, used in SchCompUdt--GPU.c */ #if 0 @@ -782,8 +782,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -810,8 +811,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. 
BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -826,16 +831,19 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,gpuHostMallocDefault)) ) ABORT("Malloc fails for zgemm buffer V"); - if ( iam==0 )DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d GPU Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d GPU Streams \n",nstreams ); + fflush(stdout); + } #endif gpublasHandle_t *handle; @@ -878,10 +886,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -889,7 +898,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU)\n", iam, bigv_size); fflush(stdout); #endif @@ -903,7 +912,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for zgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1756,29 +1766,29 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. 
Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 593b21d5..46461501 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -213,8 +213,7 @@ pzgstrf2_trsm /* Diagonal pivot */ i = luptr; if ( options->ReplaceTinyPivot == YES ) { - if ( slud_z_abs1(&lusup[i]) < thresh && - lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* Diagonal */ + if ( slud_z_abs1(&lusup[i]) < thresh ) { /* Diagonal */ #if ( PRNTlevel>=2 ) printf ("(%d) .. col %d, tiny pivot %e ", @@ -363,44 +362,35 @@ pzgstrf2_trsm } /* PZGSTRF2_trsm */ -#if 0 /* COMMENT OUT 3D CODE FOR NOW */ /***************************************************************************** * The following functions are for the new pdgstrf2_ztrsm in the 3D code. 
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, - doublecomplex* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow, + doublecomplex* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; doublecomplex alpha = {1.0, 0.0}; - unsigned long long t1 = _rdtsc(); + double t1 = SuperLU_timer_(); #define GT 32 +#ifdef _OPENMP #pragma omp parallel for +#endif for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); -#if 1 - #if defined (USE_VENDOR_BLAS) - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr, - 1, 1, 1, 1); - #else - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr); - #endif -#else - cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); -#endif + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ - t1 = _rdtsc() - t1; + t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; @@ -408,13 +398,30 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u #pragma GCC push_options #pragma GCC optimize ("O0") -/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    doublecomplex *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, zLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     doublecomplex alpha = {-1.0, 0.0}, zero = {0.0, 0.0}, one = {1.0, 0.0};
 
@@ -424,8 +431,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     int_t jfst = FstBlockC (k);
     int_t jlst = FstBlockC (k + 1);
     doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
-    int_t nsupc = SuperSize (k);
-    int_t nsupr;
+    int nsupc = SuperSize (k);
+    int nsupr;
     if (Llu->Lrowind_bc_ptr[lk])
         nsupr = Llu->Lrowind_bc_ptr[lk][1];
     else
@@ -433,11 +440,11 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     doublecomplex *ublk_ptr = BlockUFactor;
     doublecomplex *ujrow = BlockUFactor;
     int_t luptr = 0;                  /* Point_t to the diagonal entries. */
-    int_t cols_left = nsupc;          /* supernode size */
+    int cols_left = nsupc;          /* supernode size */
     int_t u_diag_cnt = 0;
     int_t ld_ujrow = nsupc;       /* leading dimension of ujrow */
-    int_t incx = 1;
-    int_t incy = ld_ujrow;
+    int incx = 1;
+    int incy = ld_ujrow;
 
     for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
     {
@@ -486,18 +493,11 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
         if (--cols_left)
         {
             /*following must be int*/
-            int_t l = nsupc - j - 1;
+            int l = nsupc - j - 1;
 
 	    /* Rank-1 update */
-#if 1
-	    zgeru_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-		    &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
-		    &nsupr);
-#else
-            cblas_zgeru (CblasColMajor, l, cols_left, &alpha, &lusup[luptr + 1], incx,
-                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
-                        nsupr);
-#endif
+            superlu_zger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
             stat->ops[FACT] += 8 * l * cols_left;
         }
 
@@ -508,8 +508,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
 
     //int_t thread_id = omp_get_thread_num();
-    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
-}
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
+} /* end Local_Zgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
@@ -714,7 +714,7 @@ int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 
 int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 			      int_t *usub, doublecomplex *uval, doublecomplex *tempv,
-			      int_t knsupc, int_t nsupr, doublecomplex *lusup,
+			      int_t knsupc, int nsupr, doublecomplex *lusup,
 			      Glu_persist_t *Glu_persist)    /*glupersist for xsup for supersize*/
 {
     doublecomplex alpha = {1.0, 0.0};
@@ -727,34 +727,22 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 
     // printf("klst inside task%d\n", );
     /*find ldu */
-    int_t ldu = 0;
+    int ldu = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
         ldu = SUPERLU_MAX( klst - usub[jj], ldu) ;
     }
 
     /*pack U block into a dense Block*/
-    int_t ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
+    int ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
     	                           uval, tempv);
 
     /*now call ztrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu,
-	     1, 1, 1, 1);
-  #else
-     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu);
-  #endif
-#else
-
-    cblas_ztrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
-                 ldu, ncols, (void*) &alpha, &lusup[luptr], nsupr, tempv, ldu);
-#endif
+    
+    superlu_ztrsm("L", "L", "N", "U", ldu, ncols, alpha,
+		  &lusup[luptr], nsupr, tempv, ldu);
 
     /*now scatter the output into sparse U block*/
     zTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
@@ -762,10 +750,9 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 }
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
-#if 1 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -847,8 +834,10 @@ void pzgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -891,7 +880,9 @@ void pzgstrs2_omp
 #endif
 	    } /* end if segsize > 0 */
 	} /* end for j in parallel ... */
+#ifdef _OPENMP    
 /* #pragma omp taskwait */
+#endif
     }  /* end for b ... */
 
 #ifndef USE_Ublock_info
@@ -914,7 +905,7 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, zLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, doublecomplex *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -935,20 +926,22 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
         zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
-#endif
+#endif /* there are 2 versions of pzgstrs2_omp */
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
new file mode 100644
index 00000000..e3bd65c7
--- /dev/null
+++ b/SRC/pzgstrf3d.c
@@ -0,0 +1,391 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Performs LU factorization in 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include  "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "zlustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PZGSTRF3D performs the LU factorization in parallel using 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) zLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *         xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) zLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'zLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
+ */ +int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + zLocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* Test the input parameters. */ + *info = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter pzgstrf3d()"); +#endif + + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + +#if 0 // sherry: not used + zdiagFactBufs_t dFBuf; + zinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); +#endif + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + zscuBufs_t scuBufs; + zinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = 
trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + zLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + zLUValSubBuf_t** LUvsbs = zLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + zdiagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + /* Setting up GPU related data structures */ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + + zInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); + HyP->bigu_size = bigu_size; + int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size (); + HyP->buffer_size = buffer_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + zLUstruct_gpu_t *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + zsluGPU_t sluGPUobj; + zsluGPU_t *sluGPU = &sluGPUobj; + 
sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + + /*Initilize the lookup tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + + /* Initialize GPU data structures */ + zinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nGPUStreams = sluGPU->nGPUStreams; + } + +#endif // end GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + zsparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, 
&factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + zsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + zreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + zreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */ + +#ifdef GPU_ACC + /* This frees the GPU storage allocateed in initSluGPU3D_t() */ + if (superlu_acc_offload) { + zfree_LUstruct_gpu (sluGPU->A_gpu); + } +#endif + + /* Prepare error message - find the smallesr index i that U(i,i)==0 */ + int iinfo; + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + //printf("After factorization: INFO = %d\n", *info); fflush(stdout); + + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + zfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + 
freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + zLluBufFreeArr(numLA, LUvsbs); + zfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit pzgstrf3d()"); +#endif + return 0; + +} /* pzgstrf3d */ diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c index 1e99e227..051d6d2f 100644 --- a/SRC/pzgstrs.c +++ b/SRC/pzgstrs.c @@ -2397,6 +2397,8 @@ for (i=0;icomm ); - + // if (!iam) { printf("DBG: pzgstrs: after Barrier\n"); fflush(stdout);} #if ( PROFlevel>=2 ) { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c index 5fa8db11..4e2e52a9 100644 --- a/SRC/pzgstrs_lsum.c +++ b/SRC/pzgstrs_lsum.c @@ -13,14 +13,15 @@ at the top-level directory. * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * 
*/ @@ -539,11 +540,11 @@ void zlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -611,156 +612,155 @@ void zlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); -#endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; - - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. 
+ */ if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -802,18 +802,18 @@ void zlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &rtemp_loc[nbrow_ref+i + j*nbrow]); - } + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } nbrow_ref+=nbrow1; - } + } /* end for lb ... 
*/ // TOC(t3, t1); @@ -824,147 +824,143 @@ void zlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif /* * Send Xk to process column Pc[k]. 
*/ - if(LBtree_ptr[lk].empty_==NO){ + if(LBtree_ptr[lk].empty_==NO){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); - } - - /* - * Perform local block modifications. - */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. + */ - { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -973,7 +969,6 @@ void zlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* zLSUM_FMOD_INV */ @@ -1157,9 +1152,9 @@ void zlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. 
*/ z_sub(&lsum[il+irow + j*iknsupc], @@ -1167,14 +1162,15 @@ void zlsum_fmod_inv_master &rtemp_loc[nbrow_ref+i + j*nbrow]); } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... */ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ + + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); @@ -1375,13 +1367,12 @@ void zlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - zlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1393,8 +1384,8 @@ void zlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* zLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end zlsum_fmod_inv_master */ @@ -1454,7 +1445,7 @@ void zlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = ceil(CACHELINE/(double)dword); aln_i = 
ceil(CACHELINE/(double)iword); @@ -1518,9 +1509,9 @@ void zlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -1531,7 +1522,7 @@ void zlsum_bmod_inv stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... */ } #if ( PROFlevel>=1 ) @@ -1539,7 +1530,6 @@ void zlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1551,9 +1541,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 8 * (iklrow - fnz); + } + stat[thread_id]->ops[SOLVE] += 8 * (iklrow - fnz); } } /* for jj ... */ } @@ -1745,9 +1734,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for 
ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* zlSUM_BMOD_inv */ @@ -2009,9 +1998,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2056,9 +2045,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2097,9 +2086,9 @@ void zlsum_bmod_inv_master if ( iam != p ) { for (ii=1;iiLrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); -#if 0 // Sherry: the following is not allocated with cudaHostAlloc +#if 0 // Sherry: the following is not allocated with gpuHostAlloc //#ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); + checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i])); #endif SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); } @@ -940,27 +940,53 @@ int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, */ void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; - - pxgstrs_finalize(SOLVEstruct->gstrs_comm); - - if ( options->RefineInitialized ) { - pzgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + if ( options->SolveInitialized ) { + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + + if ( options->RefineInitialized ) { + pzgsmv_finalize(SOLVEstruct->gsmv_comm); + options->RefineInitialized = NO; + } + SUPERLU_FREE(SOLVEstruct->gsmv_comm); + 
SUPERLU_FREE(SOLVEstruct->row_to_proc); + SUPERLU_FREE(SOLVEstruct->inv_perm_c); + SUPERLU_FREE(SOLVEstruct->diag_procs); + SUPERLU_FREE(SOLVEstruct->diag_len); + if ( SOLVEstruct->A_colind_gsmv ) + SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); + options->SolveInitialized = NO; } - SUPERLU_FREE(SOLVEstruct->gsmv_comm); - SUPERLU_FREE(SOLVEstruct->row_to_proc); - SUPERLU_FREE(SOLVEstruct->inv_perm_c); - SUPERLU_FREE(SOLVEstruct->diag_procs); - SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); - options->SolveInitialized = NO; } /* zSolveFinalize */ +void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +} /* zDestroy_A3d_gathered_on_2d */ + + /*! 
\brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; @@ -980,8 +1006,8 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); diff --git a/SRC/sSchCompUdt-2Ddynamic.c b/SRC/sSchCompUdt-2Ddynamic.c new file mode 100644 index 00000000..f4dc643e --- /dev/null +++ b/SRC/sSchCompUdt-2Ddynamic.c @@ -0,0 +1,714 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This file contains the main loop of pdgstrf which involves rank k + * update of the Schur complement. + * Uses 2D partitioning for the scatter phase. + * + *
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 14, 2017
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros),
+ *     gather only trailing columns of the L-panel corresponding to the nonzeros
+ *     of U-rows.
+ *   - Padding zeros for nice dimensions of GEMM.
+ *
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ */
+
+#define SCHEDULE_STRATEGY guided
+
+/*
+ * Buffers:
+ *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ *                                            (A matrix in C := A*B )
+ *     bigU : stores the U-panel (B matrix in C := A*B)
+ *     bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
+
+if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+    int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
+    int temp_nbrow;   /* nonzero rows in current block L(i,k) */
+    lptr  = lptr0;
+    luptr = luptr0;
+    int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+			   and remaining part.  */
+
+    /*******************************************************************
+     * Separating L blocks into the top part within look-ahead window
+     * and the remaining ones.
+     *******************************************************************/
+
+     int lookAheadBlk=0, RemainBlk=0;
+
+     tt_start = SuperLU_timer_();
+
+     /* Sherry -- can this loop be threaded?? */
+     /* Loop through all blocks in L(:,k) to set up pointers to the start
+      * of each block in the data arrays.
+      *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
+      *   - lookAheadStRow[i] := number of nonzero rows before block i
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[]
+      *   - (ditto Remain_Info[i])
+      */
+     for (int i = 0; i < nlb; ++i) {
+	 ib = lsub[lptr];            /* Block number of L(i,k). */
+	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+
+	 int look_up_flag = 1; /* assume ib is outside look-up window */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+	      ++j) {
+		 if ( ib == perm_c_supno[j] ) {
+		     look_up_flag = 0; /* flag ib within look-up window */
+                     break;            /* Sherry -- can exit the loop?? */
+                 }
+	 }
+
+	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
+	     if (lookAheadBlk==0) {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
+	     } else {
+		 lookAheadFullRow[lookAheadBlk] =
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
+	     }
+	     lookAheadStRow[lookAheadBlk] = cum_nrow;
+	     lookAhead_lptr[lookAheadBlk] = lptr;
+	     lookAhead_ib[lookAheadBlk] = ib;
+	     lookAheadBlk++;
+	 } else { /* ib is not in look-up window */
+	     if ( RemainBlk==0 ) {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow;
+	     } else {
+		 Remain_info[RemainBlk].FullRow =
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;
+	     }
+             RemainStRow[RemainBlk] = cum_nrow;
+             // Remain_lptr[RemainBlk] = lptr;
+	     Remain_info[RemainBlk].lptr = lptr;
+	     // Remain_ib[RemainBlk] = ib;
+	     Remain_info[RemainBlk].ib = ib;
+	     RemainBlk++;
+	 }
+
+         cum_nrow += temp_nbrow;
+
+	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+	 lptr += temp_nbrow;     /* Move to next block */
+	 luptr += temp_nbrow;
+     }  /* for i ... set up pointers for all blocks in L(:,k) */
+
+     lptr = lptr0;
+     luptr = luptr0;
+
+     /* leading dimension of L look-ahead buffer, same as Lnbrow */
+     //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+     Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+     /* leading dimension of L remaining buffer, same as Rnbrow */
+     //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+     /* Piyush fix */
+     //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+     nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
+     LookAheadRowSepMOP += 2*knsupc*(nbrow);
+
+     /***********************************************
+      * Gather U blocks (AFTER LOOK-AHEAD WINDOW)   *
+      ***********************************************/
+     tt_start = SuperLU_timer_();
+
+     if ( nbrow > 0 ) { /* L(:,k) is not empty */
+	 /*
+	  * Counting U blocks
+	  */
+     	 ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
+	 int temp_ncols = 0;
+
+	 /* jj0 contains the look-ahead window that was updated in
+	    dlook_ahead_update.c. Now the search can continue from that point,
+	    not to start from block 0. */
+#if 0 // Sherry comment out 5/21/2018
+	 /* Save pointers at location right after look-ahead window
+	    for later restart. */
+	 iukp0 = iukp;
+	 rukp0 = rukp;
+#endif
+
+	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+
+         /*
+	  * Loop through all blocks in U(k,:) to set up pointers to the start
+          * of each block in the data arrays, store them in Ublock_info[j]
+          * for block U(k,j).
+  	  */
+	 for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+	     temp_ncols = 0;
+#if 1
+	     /* Cannot remove following call, since perm_u != Identity  */
+	     arrive_at_ublock(
+			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			      iukp0, rukp0, usub, perm_u, xsup, grid
+			      );
+#else
+	     jb = usub[iukp];
+	     /* ljb = LBj (jb, grid);   Local block number of U(k,j). */
+	     nsupc = SuperSize(jb);
+	     iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+	     Ublock_info[j].iukp = iukp;
+	     Ublock_info[j].rukp = rukp;
+	     Ublock_info[j].jb = jb;
+
+	     /* if ( iam==0 )
+		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+			"Ublock_info[j].jb %d, nsupc %d\n",
+			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+			Ublock_info[j].jb, nsupc); */
+
+	     /* Prepare to call GEMM. */
+	     jj = iukp;
+	     for (; jj < iukp+nsupc; ++jj) {
+		 segsize = klst - usub[jj];
+		 if ( segsize ) {
+                    ++temp_ncols;
+                    if ( segsize > ldu ) ldu = segsize;
+		 }
+	     }
+
+	     Ublock_info[j].full_u_cols = temp_ncols;
+	     ncols += temp_ncols;
+#if 0 // Sherry comment out 5/31/2018
+	     /* Jump number of nonzeros in block U(k,jj);
+		Move to block U(k,j+1) in nzval[] array.  */
+	     rukp += usub[iukp - 1];
+	     iukp += nsupc;
+#endif
+         } /* end for j ... compute ldu & ncols */
+
+	 /* Now doing prefix sum on full_u_cols.
+	  * After this, full_u_cols is the number of nonzero columns
+          * from block 0 to block j.
+          */
+	 for ( j = jj0+1; j < nub; ++j) {
+	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
+	 }
+
+	 /* Padding zeros to make {m,n,k} multiple of vector length. */
+	 jj = 8; //n;
+	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+	     gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+	     gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+	     //gemm_n_pad = ncols;
+	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+	     gemm_k_pad = ldu;
+
+	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
+		 for (j = 0; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = 0; i < Rnbrow; ++i)
+		 for (j = ldu; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
+		 for (j = 0; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	     for (i = 0; i < ldu; ++i)
+		 for (j = ncols; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	 } else {
+	     gemm_m_pad = Rnbrow;
+	     gemm_n_pad = ncols;
+	     gemm_k_pad = ldu;
+	 }
+
+	 tempu = bigU; /* buffer the entire row block U(k,:) */
+
+         /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+    private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+    default (shared) schedule(SCHEDULE_STRATEGY)
+#endif
+        for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+
+            if (j==jj0) tempu = bigU;
+            //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+            else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
+
+            /* == processing each of the remaining columns in parallel == */
+#if 0
+	    /* Can remove following call, since search was already done.  */
+            arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			     iukp0, rukp0, usub,perm_u, xsup, grid);
+#else
+	    iukp = Ublock_info[j].iukp;
+	    rukp = Ublock_info[j].rukp;
+	    jb = Ublock_info[j].jb;
+	    nsupc = SuperSize (jb );
+#endif
+            /* Copy from U(k,j) to tempu[], padding zeros.  */
+            for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+                    lead_zero = ldu - segsize;
+                    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+		    //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+		    for (i = 0; i < segsize; ++i)
+                    	tempu[i+lead_zero] = uval[rukp+i];
+                    rukp += segsize;
+                    tempu += gemm_k_pad;
+                }
+	    }
+        }   /* parallel for j = jj0 .. nub */
+
+#if 0
+	if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+	fflush(stdout);
+#endif
+
+        GatherMOP += 2*ldu*ncols;
+
+    }  /* end if (nbrow>0), end gather U blocks */
+
+    GatherUTimer += SuperLU_timer_() - tt_start;
+    int jj_cpu = nub;       /* limit between CPU and GPU */
+    int thread_id;
+    /*tempv = bigV;*/
+
+    /**********************
+     * Gather L blocks    *
+     **********************/
+     tt_start = SuperLU_timer_();
+
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+     for (i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest, temp_nbrow;
+	 if ( i==0 ) {
+	     StRowDest = 0;
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+
+	 int StRowSource = lookAheadStRow[i];
+
+	 /* Now copying one block into L lookahead buffer */
+	 /* #pragma omp parallel for (gives slow down) */
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+						    corresponding to zero U rows */
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+	     //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+	     tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+	     tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+	     memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+		    &lusup[luptr+j*nsupr + StRowSource],
+		    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... gather Lval blocks from lookahead window */
+
+     /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+    schedule(SCHEDULE_STRATEGY)
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+         int StRowDest, temp_nbrow;
+         if ( i==0 )  {
+	     StRowDest  = 0;
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+	 }
+
+	 int StRowSource = RemainStRow[i];
+
+	 /* Now copying a block into L remaining buffer */
+	 // #pragma omp parallel for (gives slow down)
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (int j = knsupc-ldu; j < knsupc; ++j) {
+	     // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+	     //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+	     tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+	     tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+	     memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+		    &lusup[luptr+j*nsupr + StRowSource],
+                    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... copy Lval into the remaining buffer */
+
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
+
+
+     /*************************************************************************
+      * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+      *************************************************************************/
+     tempu = bigU;  /* setting to the start of padded U(k,:) */
+
+     if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+	 /***************************************************************
+	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+	  ***************************************************************/
+
+	 /* Count flops for total GEMM calls */
+	 ncols = Ublock_info[nub-1].full_u_cols;
+ 	 flops_t flps = 2.0 * (flops_t)Lnbrow * ldu * ncols;
+	 LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+	 schur_flop_counter += flps;
+	 stat->ops[FACT]    += flps;
+	 LookAheadGEMMFlOp  += flps;
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id)
+	 {
+#ifdef _OPENMP	 
+	   thread_id = omp_get_thread_num();
+#else	   
+	   thread_id = 0;
+#endif
+
+	   /* Ideally, should organize the loop as:
+	      for (j = 0; j < nub; ++j) {
+	          for (lb = 0; lb < lookAheadBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+		  }
+	      }
+	      But now, we use collapsed loop to achieve more parallelism.
+	      Total number of block updates is:
+	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+	   */
+
+	   int i = sizeof(int);
+	   int* indirect_thread    = indirect + (ldt + CACHELINE/i) * thread_id;
+	   int* indirect2_thread   = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	   thread_id = 0;
+	   int* indirect_thread    = indirect;
+	   int* indirect2_thread   = indirect2;
+#endif
+	   /* Each thread is assigned one loop index ij, responsible for
+	      block update L(lb,k) * U(k,j) -> tempv[]. */
+	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	       /* jj0 starts after look-ahead window. */
+            int j   = ij/lookAheadBlk + jj0;
+            int lb  = ij%lookAheadBlk;
+
+            /* Getting U block U(k,j) information */
+            /* unsigned long long ut_start, ut_end; */
+            int_t rukp =  Ublock_info[j].rukp;
+            int_t iukp =  Ublock_info[j].iukp;
+            int jb   =  Ublock_info[j].jb;
+            int nsupc = SuperSize(jb);
+            int ljb = LBj (jb, grid);  /* destination column block */
+            int st_col;
+            int ncols;  /* Local variable counts only columns in the block */
+            if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
+                ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+                st_col = Ublock_info[j-1].full_u_cols;
+            } else {
+                ncols  = Ublock_info[j].full_u_cols;
+                st_col = 0;
+            }
+
+            /* Getting L block L(i,k) information */
+            int_t lptr = lookAhead_lptr[lb];
+            int ib   = lookAhead_ib[lb];
+            int temp_nbrow = lsub[lptr+1];
+            lptr += LB_DESCRIPTOR;
+            int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+
+	    /* Block-by-block GEMM in look-ahead window */
+#if 0
+	    i = sizeof(float);
+	    float* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+	    float* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
+#if ( PRNTlevel>= 1)
+	    if (thread_id == 0) tt_start = SuperLU_timer_();
+	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
+	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+#endif
+
+#if defined (USE_VENDOR_BLAS)
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
+
+#if (PRNTlevel>=1 )
+	    if (thread_id == 0) {
+		tt_end = SuperLU_timer_();
+		LookAheadGEMMTimer += tt_end - tt_start;
+		tt_start = tt_end;
+	    }
+#endif
+            if ( ib < jb ) {
+                sscatter_u (
+				 ib, jb,
+				 nsupc, iukp, xsup,
+				 klst, temp_nbrow,
+				 lptr, temp_nbrow, lsub,
+				 usub, tempv1,
+				 Ufstnz_br_ptr, Unzval_br_ptr,
+				 grid
+			        );
+            } else {
+#if 0
+		//#ifdef USE_VTUNE
+	    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	    __itt_resume(); // start VTune, again use 2 underscores
+#endif
+                sscatter_l (
+				 ib, ljb,
+				 nsupc, iukp, xsup,
+ 				 klst, temp_nbrow,
+				 lptr, temp_nbrow,
+				 usub, lsub, tempv1,
+				 indirect_thread, indirect2_thread,
+				 Lrowind_bc_ptr, Lnzval_bc_ptr,
+				 grid
+				);
+#if 0
+		//#ifdef USE_VTUNE
+		__itt_pause(); // stop VTune
+		__SSC_MARK(0x222); // stop SDE tracing
+#endif
+            }
+
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0)
+		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+	   } /* end omp for ij = ... */
+
+#ifdef _OPENMP
+	 } /* end omp parallel */
+#endif
+     } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
+    /***************************************************************
+     * Updating remaining rows and columns on CPU.
+     ***************************************************************/
+    ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+	double flps = 2.0 * (double)Rnbrow * ldu * ncols;
+	schur_flop_counter  += flps;
+	stat->ops[FACT]     += flps;
+
+#if ( PRNTlevel>=1 )
+	RemainGEMM_flops += flps;
+	gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+	gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+	tt_start = SuperLU_timer_();
+	/* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+	   iam, k0,Rnbrow,ldu,ncols,RemainBlk);  fflush(stdout);
+	assert( Rnbrow*ncols < bigv_size ); */
+#endif
+	/* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
+
+#if ( PRNTlevel>=1 )
+	tt_end = SuperLU_timer_();
+	RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+	//fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+	// (tt_end - tt_start)*1e6); // time in microsecond
+	//fflush(fgemm);
+	gemm_stats[gemm_count].m = Rnbrow;
+	gemm_stats[gemm_count].n = ncols;
+	gemm_stats[gemm_count].k = ldu;
+	gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+	tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+	__SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	__itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+	/* Scatter into destination block-by-block. */
+#ifdef _OPENMP
+#pragma omp parallel default(shared) private(thread_id)
+	{
+#ifdef _OPENMP	
+	    thread_id = omp_get_thread_num();
+#else	    
+	    thread_id = 0;
+#endif
+
+	    /* Ideally, should organize the loop as:
+               for (j = 0; j < jj_cpu; ++j) {
+	           for (lb = 0; lb < RemainBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+                   }
+               }
+	       But now, we use collapsed loop to achieve more parallelism.
+	       Total number of block updates is:
+	       (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	    */
+
+	    int i = sizeof(int);
+	    int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+	    int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	    thread_id = 0;
+	    int* indirect_thread = indirect;
+	    int* indirect2_thread = indirect2;
+#endif
+	    /* Each thread is assigned one loop index ij, responsible for
+	       block update L(lb,k) * U(k,j) -> tempv[]. */
+	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+		/* jj_cpu := nub, jj0 starts after look-ahead window. */
+		int j   = ij / RemainBlk + jj0; /* j-th block in U panel */
+		int lb  = ij % RemainBlk;       /* lb-th block in L panel */
+
+		/* Getting U block U(k,j) information */
+		/* unsigned long long ut_start, ut_end; */
+		int_t rukp =  Ublock_info[j].rukp;
+		int_t iukp =  Ublock_info[j].iukp;
+		int jb   =  Ublock_info[j].jb;
+		int nsupc = SuperSize(jb);
+		int ljb = LBj (jb, grid);
+		int st_col;
+		int ncols;
+		if ( j>jj0 ) {
+		    ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+		    st_col = Ublock_info[j-1].full_u_cols;
+		} else {
+		    ncols = Ublock_info[j].full_u_cols;
+		    st_col = 0;
+		}
+
+		/* Getting L block L(i,k) information */
+		int_t lptr = Remain_info[lb].lptr;
+		int ib   = Remain_info[lb].ib;
+		int temp_nbrow = lsub[lptr+1];
+		lptr += LB_DESCRIPTOR;
+		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
+		float* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+		/* Now scattering the block */
+
+		if ( ib < jb ) {
+		    sscatter_u (
+				ib, jb,
+				nsupc, iukp, xsup,
+				//klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+				klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+				lptr, temp_nbrow, /* row dimension of the block */
+				lsub, usub, tempv1,
+				Ufstnz_br_ptr, Unzval_br_ptr,
+				grid
+				);
+		} else {
+		    sscatter_l(
+			       ib, ljb,
+			       nsupc, iukp, xsup,
+			       //klst, temp_nbrow, Sherry
+			       klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+			       lptr, temp_nbrow, /* row dimension of the block */
+			       usub, lsub, tempv1,
+			       indirect_thread, indirect2_thread,
+			       Lrowind_bc_ptr,Lnzval_bc_ptr,
+			       grid
+			       );
+		}
+
+	    } /* end omp for (int ij =...) */
+
+#ifdef _OPENMP
+	} /* end omp parallel region */
+#endif
+
+#if ( PRNTlevel>=1 )
+	RemainScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+
+#ifdef USE_VTUNE
+	__itt_pause(); // stop VTune
+	__SSC_MARK(0x222); // stop SDE tracing
+#endif
+
+    } /* end if Rnbrow>0 ... update remaining block */
+
+}  /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/sSchCompUdt-cuda.c b/SRC/sSchCompUdt-cuda.c
new file mode 100644
index 00000000..579cc6ea
--- /dev/null
+++ b/SRC/sSchCompUdt-cuda.c
@@ -0,0 +1,589 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief This file contains the main loop of psgstrf which involves
+ *        rank k update of the Schur complement.
+ *        Uses CUDA GPU.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY dynamic
+
+/* Check the status returned by a gpuBLAS call; on failure, print the
+ * numeric error code with the source location and abort the program.
+ * The body is wrapped in do { } while(0) *without* a trailing semicolon,
+ * so the macro expands to exactly one statement and the call site's own
+ * ';' completes it.  (The original trailing ';' produced a double
+ * statement and would break a braceless if/else around the macro.)
+ */
+#define gpublasCheckErrors(fn) \
+    do { \
+        gpublasStatus_t __err = fn; \
+        if (__err != GPUBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Fatal gpublas error: %d (at %s:%d)\n", \
+                (int)(__err), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while(0)
+
+int full;
+double gemm_timer = 0.0;
+double scatter_timer = 0.0;
+
+if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
+    ldu   =0;
+    full  =1;
+    int cum_nrow;
+    int temp_nbrow;
+
+    lptr = lptr0;
+    luptr = luptr0;
+
+    nbrow= lsub[1];
+    if (myrow==krow) nbrow = lsub[1]-lsub[3];
+
+    if (nbrow>0) {
+
+        // Maximum number of columns that can fit in dC[buffer_size] on GPU 
+#if 0   // max_ldu can be < ldt, so bigu_size/ldt may be smaller, giving false alarm
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
+#else // Sherry fix
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow, max_ncols);
+#endif
+	
+        int num_streams_used, /* number of streams that will be used*/
+        ncpu_blks;            /* the leading number of CPU dgemm blks
+			         in each partition */
+        int jjj, jjj_st,jjj_global;
+        for (j = jj0; j < nub; ++j) {
+            arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
+
+            ncols =0 ;  //initialize at 0
+            jj = iukp;
+            int temp_ldu=0;
+            for (; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+		    ++ncols;
+		}
+                temp_ldu = SUPERLU_MAX(temp_ldu, segsize);
+            }
+
+            full_u_cols[j] = ncols;
+            blk_ldu[j] = temp_ldu;
+        } /* end for j = jj0..nub */
+
+        jjj = jj0; /* jj0 is the first block column after look-ahead window */
+
+        // #pragma omp barrier
+        while ( jjj < nub ) {
+            jjj_st=jjj;
+#ifdef _OPENMP
+#pragma omp single
+#endif
+            {
+                ldu = blk_ldu[jjj_st];
+                for (j = jjj_st; j < nub ; ++j) {
+
+                    /* prefix sum */
+                    if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
+
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);
+
+                    /* break condition */
+                    /* the number of columns that can be processed on GPU is
+		       limited by buffer size */
+                    if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
+                        break; // block column j+1 does not fit in GPU memory */
+                    }
+                } /* end for j=jjj_st to nub */
+
+                jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj < nub */
+
+                // TAU_STATIC_TIMER_START("work_divison");
+                /* Divide CPU-GPU gemm here.
+		 * If there is only one block, we leave it on CPU.
+		 */
+                gemm_division_cpu_gpu(
+		       &num_streams_used,/*number of streams that will be used*/
+		       stream_end_col,   /*array holding last column blk for each partition*/
+		       &ncpu_blks,       /*number of CPU gemm blks*/
+		       		// Following are inputs
+		       nbrow,            /*number of rows in A matrix*/
+		       ldu,              /*value of k in dgemm*/
+		       nstreams,
+		       full_u_cols + jjj_st, /*array containing prefix sum of GPU workload*/
+		       jjj_global - jjj_st /*number of block columns on GPU.
+		       		             If only one block, leave it on CPU*/
+                );
+                // TAU_STATIC_TIMER_STOP("work_divison");
+
+            } /* pragma omp single */
+
+            jjj = jjj_global; /* Move to the next [ CPU : GPU ] partition */
+	    
+#if 0 // !!Sherry: this test is not necessary
+	    // if jjj_global - jjj_st == 1, everything is on CPU.
+	    // bigv_size is calculated sufficiently large.
+            if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
+                printf("allocate more memory for buffer !!!!\n"
+		       ".. jjj_st %d, nbrow %d, full_u_cols[jjj_st] %d, ncol_max %d\n",
+		       jjj_st, nbrow, full_u_cols[jjj_st], ncol_max);
+                if(nbrow * full_u_cols[jjj_st] > buffer_size)
+                    printf("[%d] needed %d > buffer_size %d\n",iam,nbrow*full_u_cols[jjj_st],buffer_size );
+		fflush(stdout);
+            }
+#endif
+
+            // #pragma omp barrier
+            /* gathering circuit */
+            assert(jjj_st<jjj);
+            assert(nbrow!=0);
+            /* NOTE(review): the '<...>' span of this line was stripped by
+               extraction; reconstructed from context -- confirm against
+               dSchCompUdt-cuda.c */
+            if ( nbrow > 0 ) {
+#ifdef PI_DEBUG
+		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size ); fflush(stdout);
+		assert(nbrow*ldu<=ldt*max_row_size);
+#endif
+		gpuMemcpy2DAsync(dA, nbrow*sizeof(float),
+				  &lusup[luptr+(knsupc-ldu)*nsupr],
+				  nsupr*sizeof(float), nbrow*sizeof(float),
+				  ldu, gpuMemcpyHostToDevice, streams[0]);
+	    }
+
+	    for (int i = 0; i < num_streams_used; ++i) { // streams on GPU
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1];
+		// st starts after the leading ncpu_blks
+		int st_col = full_u_cols[st-1];
+		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
+		tempu = bigU;
+
+		float *tempv1 = bigV + full_u_cols[st-1]*nbrow;
+
+		/* Following is for testing purpose */
+		if ( num_col_stream > 0 ) {		
+#ifdef GPU_ACC
+		    int stream_id = i;
+		    int b_offset  = ldu * st_col;
+		    int c_offset  = st_col * nbrow;
+		    size_t B_stream_size = ldu * num_col_stream * sizeof(float);
+		    size_t C_stream_size = nbrow * num_col_stream * sizeof(float);
+
+		    assert(nbrow*(st_col+num_col_stream) < buffer_size);
+
+		    gpuMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
+		    		    gpuMemcpyHostToDevice, streams[stream_id]);
+
+		    gpublasCheckErrors(
+				  gpublasSetStream(handle[stream_id],
+						  streams[stream_id])
+				     );
+
+		    gpublasCheckErrors(
+				  gpublasSgemm(handle[stream_id],
+					      GPUBLAS_OP_N, GPUBLAS_OP_N,
+					      nbrow, num_col_stream, ldu,
+                                              &alpha, dA, nbrow,
+					      &dB[b_offset], ldu,
+					      &beta, &dC[c_offset],
+                                              nbrow)
+				  );
+
+		    checkGPU( gpuMemcpyAsync(tempv1, dC+c_offset,
+					   C_stream_size,
+					   gpuMemcpyDeviceToHost,
+					   streams[stream_id]) );
+#else /*-- on CPU --*/
+
+	            my_sgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
+			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
+			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
+			      tempv1, &nbrow, 1, 1);
+#endif
+   	        } // end if num_col_stream > 0
+
+	    } /* end for i = 1 to num_streams used */
+
+	    /* Special case for CPU -- leading block columns are computed 
+	       on CPU in order to mask the GPU data transfer latency */
+	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
+	    int st_col = 0; /* leading part on CPU */
+	    tempv = bigV + nbrow * st_col;
+	    tempu = bigU;
+
+	    double tstart = SuperLU_timer_();
+#if defined (USE_VENDOR_BLAS)
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
+#else
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow);
+#endif
+	    gemm_timer += SuperLU_timer_() -tstart;
+	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
+
+            /* Now scattering blocks computed by CPU */
+            int temp_ncol;
+
+            /* scatter leading blocks which CPU has computated */
+            tstart = SuperLU_timer_();
+
+#ifdef _OPENMP
+#pragma omp parallel  \
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,	\
+	    segsize,lead_zero,					\
+	    ib, temp_nbrow,ilst,lib,index,			\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,			\
+	    nzval,     lb ,                     jj, i)		\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+		int num_threads = omp_get_num_threads();
+#else
+                int thread_id = 0;
+		int num_threads = 1;
+#endif		
+
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+
+                if ( ncpu_blks < num_threads ) {
+                    // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
+
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_st, jjj_st+ncpu_blks\n",j,jjj_st,jjj_st+ncpu_blks);
+#endif
+
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+#endif
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            int cum_nrow = 0;
+                            int temp_nbrow;
+                            lptr = lptr0;
+                            luptr = luptr0;
+                            for (int i = 0; i < lb; ++i) {
+                                ib = lsub[lptr];        /* Row block L(i,k). */
+                                temp_nbrow = lsub[lptr+1];   /* Number of full rows. */
+                                lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+                                lptr += temp_nbrow;
+                                luptr += temp_nbrow;
+                                cum_nrow +=temp_nbrow;
+                            }
+
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) {  /* A(i,j) is in U. */
+                                #ifdef PI_DEBUG
+                                    printf("cpu scatter \n");
+                                    printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+                                #endif
+
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                    // TAU_STATIC_TIMER_STOP("SPECIAL_CPU_SCATTER");
+                } else { // ncpu_blks >= omp_get_num_threads()
+#ifdef _OPENMP
+#pragma omp for schedule(SCHEDULE_STRATEGY) nowait
+#endif
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d\n",j);
+#endif
+
+                        /* == processing each of the remaining columns == */
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+			    /* Now gather the result into the destination block. */
+			    if ( ib < jb ) {  /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("cpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+
+				tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+			    } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+			    } /* if ib < jb ... */
+
+			    lptr += temp_nbrow;
+			    luptr += temp_nbrow;
+			    cum_nrow += temp_nbrow;
+
+			} /* for lb ... */
+
+			luptr=luptr0;
+		    } /* for j = jjj_st ... */
+		}     /* else (ncpu_blks >= omp_get_num_threads()) */
+	    }         /* parallel region */
+
+	    scatter_timer += SuperLU_timer_() - tstart;
+	    
+	    // Scatter tempv(:, (jjj_st1 : jjj_global)) computed on GPU.
+#ifdef _OPENMP
+#pragma omp parallel							\
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
+	    segsize,lead_zero,						\
+	    ib, temp_nbrow,ilst,lib,index,				\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,				\
+	    nzval,     lb ,                     jj, i)			\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+                for(i = 0; i < num_streams_used; i++) { /* i is private variable */
+                    checkGPU(gpuStreamSynchronize (streams[i]));
+		    // jjj_st1 := first block column on GPU stream[i]
+		    int jjj_st1 = (i==0) ? jjj_st + ncpu_blks : jjj_st + stream_end_col[i-1];
+                    int jjj_end = jjj_st + stream_end_col[i];
+                    assert(jjj_end-1<nub) ;
+                    assert(jjj_st1>jjj_st) ;
+
+                    /* now scatter it */
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+                    for (j = jjj_st1; j < jjj_end; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_end %d, nub %d\n",j,jjj_end,nub); fflush(stdout);
+#endif
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th
+			   block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+                        for (lb = 0; lb < nlb; lb++) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+                            /* Now scatter result into destination block. */
+                            if ( ib < jb ) { /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("gpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("gpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                } /* end for i = 0 to nstreams */
+		
+                // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
+                // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
+		
+            } /* end pragma omp parallel */
+            // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
+	    
+        }  /* end while(jjj<nub) */
+    } /* end if nbrow>0 */
+
+ }   /* if msg1 and msg 2 */
+
+
+
diff --git a/SRC/sbinary_io.c b/SRC/sbinary_io.c
new file mode 100644
index 00000000..b1a3c81a
--- /dev/null
+++ b/SRC/sbinary_io.c
@@ -0,0 +1,42 @@
+#include "superlu_sdefs.h"
+
+int
+sread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, 
+	     float **nzval, int_t **rowind, int_t **colptr)
+{ /* Read a square sparse matrix in binary CSC form (n, nnz, colptr, rowind, nzval); allocates the three arrays and closes fp. Returns 0. */
+    size_t isize = sizeof(int_t), dsize = sizeof(float);
+    int nnz_read;
+    fread(n, isize, 1, fp); /* NOTE(review): fread return values unchecked here and below */
+    fread(nnz, isize, 1, fp);
+    printf("fread n " IFMT "\tnnz " IFMT "\n", *n, *nnz);
+    *m = *n; /* matrix is assumed square: row count mirrors column count */
+    *colptr = intMalloc_dist(*n+1);
+    *rowind = intMalloc_dist(*nnz);
+    *nzval  = floatMalloc_dist(*nnz);
+    fread(*colptr, isize, (size_t) (*n + 1), fp);
+    fread(*rowind, isize, (size_t) *nnz, fp);
+    nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp);
+    printf("# of floats fread: %d\n", nnz_read);
+    fclose(fp); /* NOTE(review): closes the caller-supplied stream as a side effect */
+    return 0;
+}
+
+int
+swrite_binary(int_t n, int_t nnz,
+	      float *values, int_t *rowind, int_t *colptr)
+{       /* Dump an n x n CSC matrix to "matrix.bin" in the layout sread_binary expects. Returns 0. */
+      FILE  *fp1;
+      int nnz_written;
+      size_t isize = sizeof(int_t), dsize = sizeof(float);
+      fp1 = fopen("matrix.bin", "wb"); /* NOTE(review): fopen result not checked for NULL */
+      fwrite(&n, isize, 1, fp1);
+      fwrite(&nnz, isize, 1, fp1);
+      fwrite(colptr, isize, n+1, fp1);
+      fwrite(rowind, isize, nnz, fp1);
+      nnz_written = fwrite(values, dsize, nnz, fp1);
+      printf("n " IFMT ", # of float: " IFMT "\n", n, nnz);
+      printf("dump binary file ... # of float fwrite: %d\n", nnz_written);
+      assert(nnz_written==nnz); /* NOTE(review): compiled out under NDEBUG, leaving the short-write unhandled */
+      fclose(fp1);
+      return 0;
+}
diff --git a/SRC/scan.cu b/SRC/scan.cu
new file mode 100644
index 00000000..5a23509c
--- /dev/null
+++ b/SRC/scan.cu
@@ -0,0 +1,194 @@
+#include <cstdio>          /* NOTE(review): header names stripped in transit — reconstructed; verify against original */
+#include <cuda_runtime.h>
+
+// typedef float pfx_dtype ; 
+
+int nextpow2(int v)
+
+{ /* Host helper: round v up to the next power of two via the bit-smearing trick. */
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16; /* covers 32-bit int; a power-of-two input returns itself */
+    v++;
+
+    return v;
+}
+
+__device__ int dnextpow2(int v)
+
+{ /* Device-side copy of nextpow2: round v up to the next power of two. */
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+
+    return v;
+}
+
+
+
+typedef int pfx_dtype ; 
+__global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    if(2*thread_id  < n_original)
+        temp[2*thread_id] = inArr[2*thread_id]; 
+    else 
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original) /* NOTE(review): span garbled in transit (text between '<'..'>' stripped); reconstructed per standard Blelloch scan */
+        temp[2*thread_id+1] = inArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] = 0;
+
+    for (int d = n>>1; d > 0; d >>= 1) 
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+    
+    if (thread_id == 0) { temp[n - 1] = 0; } 
+    for (int d = 1; d < n; d *= 2) 
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    if(2*thread_id  < n_original)
+    outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1  < n_original)
+    outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
+    __syncthreads();
+    if(2*thread_id  < n_original)
+    printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    if(2*thread_id+1  < n_original)
+    printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
+    __syncthreads();
+} 
+
+#define SELF_TEST 
+#ifdef SELF_TEST
+
+#include <iostream>  /* NOTE(review): header name stripped in transit; std::cout is used below — verify against original */
+#include "cub/cub.cuh"
+
+#define THREAD_BLOCK_SIZE 8
+
+
+// __global__
+// void cub_scan_test(int N)
+// {
+// 	int thread_id = threadIdx.x;
+// 	typedef cub::BlockScan BlockScan; /*1D int data type*/
+
+// 	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+// 	extern __shared__ int* IndirectJ1;
+// 	extern __shared__ int* IndirectJ2= IndirectJ1+ N*sizeof(int);
+
+// 	if (thread_id < N)
+// 	{
+// 		IndirectJ1[thread_id] = 2*thread_id +1;
+// 	}
+
+// 	__syncthreads();
+// 	if (thread_id < THREAD_BLOCK_SIZE)
+// 		BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
+
+
+// 	if (thread_id < THREAD_BLOCK_SIZE)
+// 		printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
+
+// }
+
+
+
+// extern __shared__
+// #define THREAD_BLOCK_SIZE 7
+
+__global__ void initData(pfx_dtype* A, int n)
+{
+    int threadId = threadIdx.x;   
+    if(threadId < n)   /* NOTE(review): this span was garbled in transit (initData body + main() prologue lost); reconstructed from surrounding uses of A, xA, N, N2 — verify against original */
+        A[threadId] = threadId + 1;
+}
+
+int main()
+{
+    const int N = THREAD_BLOCK_SIZE;
+    const int N2 = nextpow2(N) / 2;
+    pfx_dtype *A, *xA;
+    cudaMallocManaged(&A, N * sizeof(pfx_dtype));
+    cudaMallocManaged(&xA, N * sizeof(pfx_dtype));
+    initData<<< 1, N >>> (A,N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<"Error- 0\n";
+    // prescan<<<  1,THREAD_BLOCK_SIZE/2,2*THREAD_BLOCK_SIZE*sizeof(pfx_dtype) >>> (xA, A, N);
+    
+    prescan<<<  1,(N+1)/2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    prescan<<<  1,N2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<".....EXITING\n";   
+    else
+        std::cout<<"No errors reported\n";
+
+
+    // typedef cub::BlockScan BlockScan; /*1D int data type*/
+	// __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+    // cub_scan_test <<<  1,THREAD_BLOCK_SIZE >>> (N);
+
+    return 0;
+}
+
+#endif 
\ No newline at end of file
diff --git a/SRC/scatter.c b/SRC/scatter.c
new file mode 100644
index 00000000..46926955
--- /dev/null
+++ b/SRC/scatter.c
@@ -0,0 +1,589 @@
+#include "superlu_ddefs.h"
+#include "scatter.h"
+//#include "compiler.h"
+
+#define ISORT
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitilize the pointers to the begining of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+block_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{ /* GEMM of one L-block-row (lb) against one U-block-column (j), then scatter the product into the destination L or U block. */
+    // return ;
+    thread_id = omp_get_thread_num(); /* NOTE(review): caller-supplied thread_id is discarded and overwritten here */
+    int_t *indirect_thread = indirect + ldt * thread_id;  /* per-thread slice of scratch index arrays */
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;        /* per-thread GEMM output buffer */
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int_t ncols;
+    // if (j > jj0)
+    if (j > 0)
+    { /* ncols / st_col derived from the running prefix sums in full_u_cols */
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    /* Getting L block information */
+    // int_t lptr = Remain_info[lb].lptr;
+    // int_t ib   = Remain_info[lb].ib;
+    // int_t temp_nbrow = lsub[lptr + 1];
+    // lptr += LB_DESCRIPTOR;
+    // int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+
+    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, /* tempv1 (temp_nbrow x ncols) = L panel * U panel */
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    { /* destination block lies in U (above the diagonal block row) */
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    { /* destination block lies in L */
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter */
+
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+block_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{ /* Same GEMM+scatter as block_gemm_scatter, but the scatter phase is serialized via an optional per-block OpenMP lock. Uses precomputed StCol/ncols/StRow instead of prefix-sum differences. */
+    int_t *indirect_thread = indirect + ldt * thread_id;  /* per-thread scratch (thread_id trusted from caller here) */
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock)) /* busy-wait (spin) until the destination block's lock is acquired */
+        {
+
+        }
+
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    { /* destination block lies in U */
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    { /* destination block lies in L */
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock); /* release; matching acquire is the omp_test_lock spin above */
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter_lock */
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t block_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2, HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{ /* Top-left quadrant wrapper: look-ahead L buffer x host U buffer; unpacks LUstruct and times the update. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' here vs '__rdtsc' in SCATTER_PROFILE code — confirm which intrinsic/wrapper is intended */
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* per-thread timer, padded by CACHE_LINE_SIZE to avoid false sharing */
+    return 0;
+} /* block_gemm_scatterTopLeft */
+
+int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{ /* Top-right quadrant wrapper: look-ahead L buffer x Phi-side U buffer (bigU_Phi/ldu_Phi). Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+   volatile  int_t thread_id = omp_get_thread_num();
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+} /* block_gemm_scatterTopRight */
+
+
+int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{ /* Bottom-left quadrant wrapper: remaining L buffer (Remain_L_buff/Rnbrow) x host U buffer. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    //printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+
+} /* block_gemm_scatterBottomLeft */
+
+int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{ /* Bottom-right quadrant wrapper: remaining L buffer x Phi-side U buffer. Always returns 0. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+   volatile  int_t thread_id = omp_get_thread_num();
+   // printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc(); /* NOTE(review): '_rdtsc' vs '__rdtsc' used elsewhere — confirm intended intrinsic */
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* cache-line padded per-thread timer */
+    return 0;
+
+} /* block_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: The following routines may conflict with dscatter.c
+ ******************************************************************/
+#if 1
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{ /* Subtract the temp_nbrow x nsupc GEMM result tempv (leading dim nbrow) from destination L block (ib) in local column ljb. */
+
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)           /* linear search for dest block ib within column ljb */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz; /* row offset relative to first row of supernode ib */
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {   /* indirect2[i]: dest row position for source row i */
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj]; /* nonzero segment length of U column jj; 0 means skip */
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i]; /* accumulate the Schur-complement update */
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+}
+
+void   // SHERRY: NOT CALLED!!
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{ /* Subtract the temp_nbrow x nsupc GEMM result tempv (leading dim nbrow) from destination U block (ib,jb) in local block row. */
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {   /* stash the global row indices of the source rows */
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj]; /* nonzero segment length of U column jj; 0 means skip */
+        fnz = index[iuip_lib++];
+        ucol -= fnz;            /* offset so ucol[] can be indexed by the row indices in indirect[] */
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i]; /* accumulate the Schur-complement update */
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;          /* advance past this column's storage (undoes the -fnz shift) */
+
+    } /*for jj=0:nsupc */
+
+}
+
+#endif // comment out
+
diff --git a/SRC/scatter.h b/SRC/scatter.h
new file mode 100644
index 00000000..568a1687
--- /dev/null
+++ b/SRC/scatter.h
@@ -0,0 +1,147 @@
+#ifndef _SCATTER_H_
+#define _SCATTER_H_
+
+#ifdef CLEAN_SCATTER
+#define SCATTER_L_CPU  scatter_l
+#define SCATTER_U_CPU  scatter_u
+#else
+#define SCATTER_L_CPU  scatter_l  /* NOTE(review): both branches are identical; CLEAN_SCATTER currently has no effect */
+#define SCATTER_U_CPU  scatter_u
+
+#endif
+
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid);
+
+void
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid);
+
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid);
+
+
+void
+block_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  );
+
+
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+block_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       );
+
+int_t block_gemm_scatterTopLeft( int_t lb,  int_t j,
+                                 double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                 int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                 HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               );
+int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                );
+int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  );
+int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   );
+
+void gather_u(int_t num_u_blks,
+              Ublock_info_t *Ublock_info, int_t * usub,
+              double *uval,  double *bigU,  int_t ldu,
+              int_t *xsup, int_t klst                /* for SuperSize */
+             );
+
+void gather_l( int_t num_LBlk, int_t knsupc,
+               Remain_info_t *L_info,
+               double * lval, int_t LD_lval,
+               double * L_buff );
+#endif
diff --git a/SRC/scommunication_aux.c b/SRC/scommunication_aux.c
new file mode 100644
index 00000000..55e83851
--- /dev/null
+++ b/SRC/scommunication_aux.c
@@ -0,0 +1,504 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t sIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row.
+ Non-blocking: one MPI_Isend pair per destination column; the requests
+ are stored in send_req[pj] (index) and send_req[pj + Pc] (values).
+ NOTE(review): tag_ub is not referenced directly -- presumably consumed
+ by the SLU_MPI_TAG macro; confirm in superlu_defs.h.*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        /* msgcnt[0]: length of the index message = header + per-block
+           descriptors + row subscripts (lsub[0] = #blocks, lsub[1] =
+           #rows).  msgcnt[1]: #numerical values = rows x supernode width. */
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        /* Empty panel: zero-length messages are still posted below. */
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t sBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row.
+ Blocking variant of sIBcast_LPanel: uses MPI_Send and accumulates the
+ elapsed wall time into SCT.*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        /* msgcnt[0]: index-array length; msgcnt[1]: #numerical values. */
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    /* NOTE(review): this L-panel broadcast accumulates into
+       Bcast_UPanel_tl (as did the commented-out rdtsc line) -- confirm
+       whether a dedicated L-panel counter was intended in SCT_t. */
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t sIBcast_UPanel
+/*asynchronously broadcasts the U panel down my process column (the code
+ uses the column scope cscp and loops over process rows).  Non-blocking:
+ requests are stored in send_req_u[pi] (index array) and
+ send_req_u[pi + Pr] (values).*/
+(int_t k, int_t k0, int_t* usub, float* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        /* usub[2]: total length of the index array; usub[1]: total
+           number of nonzero values (see the U-structure setup, where
+           index[1] = nzval length and index[2] = index length). */
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_FLOAT,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts the U panel down my process column (column
+  scope cscp, looping over process rows) using blocking MPI_Send; the
+  elapsed wall time is added to SCT->Bcast_UPanel_tl. */
+int_t sBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     float* uval, gridinfo_t *grid,
+		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        /* usub[2]: index-array length; usub[1]: #nonzero values. */
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+/* Post non-blocking receives for an L panel: the index array into
+   Lsub_buf and the numerical values into Lval_buf, both from the process
+   column that owns supernode k.  Completion requests are left in
+   recv_req[0] (index, tag 0) and recv_req[1] (values, tag 1), matching
+   the sends in sIBcast_LPanel/sBcast_LPanel. */
+int_t sIrecv_LPanel
+(int_t k, int_t k0,  int_t* Lsub_buf, float* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, sLocalLU_t *Llu, int tag_ub )
+{
+    superlu_scope_t *rowscope = &grid->rscp; /* process-row communicator */
+    int_t src = PCOL (k, grid);              /* owner column of block k  */
+
+    /* Receive up to the preallocated buffer maxima; the actual lengths
+       are recovered later via MPI_Get_count. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, src,
+               SLU_MPI_TAG (0, k0), rowscope->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_FLOAT, src,
+               SLU_MPI_TAG (1, k0), rowscope->comm, &recv_req[1]);
+    return 0;
+}
+
+
+/* Post non-blocking receives for a U panel: the index array into
+   Usub_buf (tag 2) and the numerical values into Uval_buf (tag 3), both
+   from the process row that owns supernode k.  Completion requests are
+   left in recv_req_u[0] and recv_req_u[1]. */
+int_t sIrecv_UPanel
+(int_t k, int_t k0, int_t* Usub_buf, float* Uval_buf, sLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    superlu_scope_t *colscope = &grid->cscp; /* process-column communicator */
+    int_t src = PROW (k, grid);              /* owner row of block k       */
+
+    /* Receive up to the preallocated buffer maxima; actual lengths are
+       recovered later via MPI_Get_count (see sWait_URecv). */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, src,
+               SLU_MPI_TAG (2, k0), colscope->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, src,
+               SLU_MPI_TAG (3, k0), colscope->comm, &recv_req_u[1]);
+    return 0;
+}
+
+int_t sWait_URecv
+/*waits for the posted U-panel receives to complete (index in
+ recv_req[0], values in recv_req[1]) and records the actual message
+ lengths in msgcnt[2] (index count) and msgcnt[3] (value count);
+ elapsed wait time is added to SCT->Wait_URecv_tl*/
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t sWait_LRecv
+/*waits till L blocks have been received.  On return msgcnt[0]/msgcnt[1]
+ hold the index/value message lengths: taken via MPI_Get_count when a
+ receive was pending, otherwise copied from the previously recorded
+ counts in msgcntsU.  Completed requests are reset to MPI_REQUEST_NULL.*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    /* Index-array receive (posted in recv_req[0]). */
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        /* Nothing pending: reuse the count recorded earlier. */
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    /* Numerical-values receive (posted in recv_req[1]). */
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+/* Non-blocking point-to-point broadcast of the factored U diagonal
+   block down my process column: one MPI_Isend (tag 4) per other process
+   row, with the request stored in U_diag_blk_send_req[pr]. */
+int_t sISend_UDiagBlock(int_t k0, float *ublk_ptr,
+                       int_t size,
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    MPI_Comm col_comm = (grid->cscp).comm;
+    int_t myrow = MYROW (grid->iam, grid);
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < grid->nprow; ++dest)
+    {
+        if (dest == myrow) continue;   /* no self-send */
+        MPI_Isend (ublk_ptr, size, MPI_FLOAT, dest,
+                   SLU_MPI_TAG (4, k0),
+                   col_comm, U_diag_blk_send_req + dest);
+    }
+    return 0;
+}
+
+
+/* Blocking receive of the U diagonal block of supernode k0 from rank
+   `src` over the process-column communicator (tag 4); the elapsed wall
+   time is added to SCT->Recv_UDiagBlock_tl. */
+int_t sRecv_UDiagBlock(int_t k0, float *ublk_ptr,
+                      int_t size,
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double start = SuperLU_timer_();
+    MPI_Status status;
+
+    MPI_Recv (ublk_ptr, size, MPI_FLOAT, src,
+              SLU_MPI_TAG (4, k0), (grid->cscp).comm, &status);
+
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - start;
+    return 0;
+}
+
+
+int_t sPackLBlock(int_t k, float* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, sLocalLU_t *Llu)
+/*Copies the leading nsupc x nsupc (diagonal) part of the local L block
+ of supernode k, stored with leading dimension nsupr, into the dense
+ column-major buffer Dest with leading dimension nsupc.*/
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    float *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    /* NOTE(review): the disabled branch above calls double-precision
+       LAPACKE_dlacpy in this single-precision file -- if ever
+       re-enabled, confirm it should be LAPACKE_slacpy. */
+    /* Copy nsupc entries per column, stride nsupr -> stride nsupc. */
+    for (int j = 0; j < nsupc; ++j) {
+	memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(float) );
+    }
+#endif
+    
+    return 0;
+}
+
+/* Non-blocking point-to-point broadcast of the factored L diagonal
+   block along my process row: one MPI_Isend (tag 5) per other process
+   column, with the request stored in L_diag_blk_send_req[pc]. */
+int_t sISend_LDiagBlock(int_t k0, float *lblk_ptr,
+                       int_t size,
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    MPI_Comm row_comm = (grid->rscp).comm;
+    int_t mycol = MYCOL (grid->iam, grid);
+
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t dest = 0; dest < grid->npcol; ++dest)
+    {
+        if (dest == mycol) continue;   /* no self-send */
+        MPI_Isend (lblk_ptr, size, MPI_FLOAT, dest,
+                   SLU_MPI_TAG (5, k0),
+                   row_comm, L_diag_blk_send_req + dest);
+    }
+    return 0;
+}
+
+
+/* Post a non-blocking receive for the U diagonal block of supernode k0
+   from rank `src` over the process-column communicator (tag 4); the
+   request is returned in U_diag_blk_recv_req and the elapsed wall time
+   is accumulated into SCT->Recv_UDiagBlock_tl. */
+int_t sIRecv_UDiagBlock(int_t k0, float *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+
+    /* MPI functions return a plain int error code, not int_t. */
+    int err = MPI_Irecv (ublk_ptr, size, MPI_FLOAT, src,
+                         SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        /* Diagnostics go to stderr so they are not lost in redirected
+           program output. */
+        fprintf(stderr, "Error in IRecv_UDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+/* Post a non-blocking receive for the L diagonal block of supernode k0
+   from rank `src` over the process-row communicator (tag 5); the
+   request is returned in L_diag_blk_recv_req.
+   NOTE(review): the elapsed time is accumulated into Recv_UDiagBlock_tl
+   (same counter as the U variant) -- confirm whether an L-specific
+   counter exists in SCT_t. */
+int_t sIRecv_LDiagBlock(int_t k0, float *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+
+    /* MPI functions return a plain int error code, not int_t. */
+    int err = MPI_Irecv (L_blk_ptr, size, MPI_FLOAT, src,
+                         SLU_MPI_TAG (5, k0),
+                         comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        /* Diagnostics go to stderr so they are not lost in redirected
+           program output. */
+        fprintf(stderr, "Error in IRecv_lDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+/* Collective non-blocking broadcast (MPI_Ibcast) of the U diagonal
+   block along the process column; the root is the process row owning
+   supernode k.  The completion request is left in
+   L_diag_blk_ibcast_req; the caller must wait/test it. */
+int_t sIBcast_UDiagBlock(int_t k, float *ublk_ptr,
+                        int_t size,
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t root = PROW (k, grid);
+
+    MPI_Ibcast(ublk_ptr, size, MPI_FLOAT, root,
+               (grid->cscp).comm, L_diag_blk_ibcast_req);
+    return 0;
+}
+
+/* Collective non-blocking broadcast (MPI_Ibcast) of the L diagonal
+   block along the process row; the root is the process column owning
+   supernode k.  The completion request is left in
+   U_diag_blk_ibcast_req; the caller must wait/test it. */
+int_t sIBcast_LDiagBlock(int_t k, float *lblk_ptr,
+                        int_t size,
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t root = PCOL (k, grid);
+
+    MPI_Ibcast(lblk_ptr, size, MPI_FLOAT, root,
+               (grid->rscp).comm, U_diag_blk_ibcast_req);
+    return 0;
+}
+
+#endif 
+
+/* If a receive for the U diagonal block of supernode k has been posted
+   (IrecvPlcd_D[k] == 1) and this rank lies in k's process column but is
+   not the diagonal process, spin until that receive completes and mark
+   the L panel ready to factor (factored_L[k] = 1).
+
+   The previous implementation also read Llu->Lrowind_bc_ptr[lk] to
+   compute a local `nsupr` that was never used; that dead code has been
+   removed.  LUstruct is retained so the signature is unchanged. */
+int_t sUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t pkk = PNUM (PROW (k, grid), kcol, grid);
+
+    (void) LUstruct;   /* unused; kept for interface compatibility */
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* Factor the L panel once the diagonal block has arrived. */
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+
+            /* Busy-wait on the posted receive; Test_UDiagBlock_Recv
+               also accounts the wait time in SCT. */
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/sdistribute.c b/SRC/sdistribute.c
new file mode 100644
index 00000000..964f7ce4
--- /dev/null
+++ b/SRC/sdistribute.c
@@ -0,0 +1,1652 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * 
+ */
+#include "superlu_sdefs.h"
+
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * 
+ */ + +float +sdistribute(fact_t fact, int_t n, SuperMatrix *A, + Glu_freeable_t *Glu_freeable, + sLUstruct_t *LUstruct, gridinfo_t *grid) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NCPformat *Astore; + float *a; + int_t *asub; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row 
blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. */ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, sword; + float mem_use = 0.0; + + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + float **Linv_bc_ptr; /* size 
ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + sword = sizeof(float); +//#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdistribute()"); +#endif + + if ( fact == SamePattern_SameRowPerm ) { + /* --------------------------------------------------------------- + * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. + * --------------------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = floatCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. 
of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*sword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... */ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. 
*/ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... 
*/ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif + + } else { + /* -------------------------------------------------- + * FIRST TIME CREATING THE L AND U DATA STRUCTURE. + * -------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* No L and U data structures are available yet. + We need to set up the L and U data structures and propagate + the values of A into them. */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. 
*/ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + + + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. 
*/ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Phase 2 - setup U strut time: %.2f\t\n", t); +#endif + + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if (!(dense=floatCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) + *sp_ienv_dist(3))))) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); +#if ( PRNTlevel>=1 ) + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*sword; +#endif + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. 
*/ + if ( !(Lnzval_bc_ptr = (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. 
+ ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } + + jbrow = PROW( jb, grid ); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. 
owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. 
proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) )) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + 
index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], and + the initial values of A from SPA into Lnzval[]. */ + + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. 
*/ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. 
*/ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 
1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); +#endif + + } /* else fact != SamePattern_SameRowPerm */ + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit sdistribute()"); +#endif + + return (mem_use); +} /* SDISTRIBUTE */ + diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c new file mode 100644 index 00000000..db73cb4d --- /dev/null +++ b/SRC/sec_structs.c @@ -0,0 +1,665 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + + +//#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include /*for Qsort */ +#include +#include /*for sqrt*/ +#include +#include "compiler.h" +//#include "load-balance/supernodal_etree.h" +#include "supernodal_etree.h" +#endif + +#include /*for printfs*/ +#include /*for getline*/ + +double CPU_CLOCK_RATE; +/*for sorting structures */ +int Cmpfunc_R_info (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->nrows - ((Remain_info_t*)b)->nrows ); +} + + + +int Cmpfunc_U_info (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->ncols - ((Ublock_info_t*)b)->ncols ); +} + + +int sort_R_info( Remain_info_t* Remain_info, int n ) +{ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info); + + return 0; +} + +int sort_U_info( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info); + + return 0; +} + +int Cmpfunc_R_info_elm (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->eo - ((Remain_info_t*)b)->eo ); +} + + + +int Cmpfunc_U_info_elm (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->eo - ((Ublock_info_t*)b)->eo ); +} + + + +int sort_R_info_elm( Remain_info_t* Remain_info, int n ) +{ + /*sorts on the basis of order of elimination*/ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info_elm); + + return 0; +} + +int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info_elm); + + return 0; +} + +double *SCT_ThreadVarInit(int_t num_threads) +{ +#if 0 + double *var = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + double *var = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + var[i] = 0.0; + } + return var; +} + + +#define DEFAULT_CPU_FREQ 3000.0 // 3 GHz + +double getFreq(void) +{ + FILE *fp = 
fopen("/proc/cpuinfo", "rb"); + if(!fp) { + // the file /proc/cpuinfo doesn't exists, return 3000 Mhz as the frequency +#if ( PRNTlevel>=2 ) + printf("/proc/cpuinfo doesn't exists, using 3GHz as CPU frequency. Some timers will not be correct\n"); +#endif + return DEFAULT_CPU_FREQ; + } + + char *arg = 0; +#if 1 + size_t len = 0; + char *line = NULL; +#else + size_t len = 100; // Sherry fix + char *line = SUPERLU_MALLOC(len * sizeof(char)); +#endif + size_t read; + while ((read = getline(&line, &len, fp)) != -1) + { + // printf("%s", line); + char * pch; + pch = strtok (line, " \t:"); + if (pch != NULL && strcmp(pch, "cpu") == 0) + { + + /* code */ + pch = strtok (NULL, " \t:"); + // printf("%s\n", pch ); + if (pch != NULL && strcmp(pch, "MHz") == 0) + { + pch = strtok (NULL, " \t:"); + double freq = atof(pch); + free(arg); + fclose(fp); + return freq; + + break; + } + } + free(line); + line = NULL; + } + + //SUPERLU_FREE(line); // sherry added + return 0; +} + +/* Initialize various counters. */ +void SCT_init(SCT_t* SCT) +{ +#if 1 + CPU_CLOCK_RATE = getFreq() * 1e-3; +#else + CPU_CLOCK_RATE = 3000. 
* 1e-3; +#endif + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } +#endif + + SCT->acc_load_imbal = 0.0; + + /* Counter for couting memory operations */ + SCT->scatter_mem_op_counter = 0.0; + SCT->scatter_mem_op_timer = 0.0; +#ifdef SCATTER_PROFILE + SCT->Host_TheadScatterMOP = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + SCT->Host_TheadScatterTimer = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + memset(SCT->Host_TheadScatterMOP, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); + memset(SCT->Host_TheadScatterTimer, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); +#endif + + SCT->LookAheadRowSepTimer = 0.0; + SCT->LookAheadRowSepMOP = 0.0; + SCT->GatherTimer = 0.0; + SCT->GatherMOP = 0.0; + SCT->LookAheadGEMMTimer = 0.0; + SCT->LookAheadGEMMFlOp = 0.0; + SCT->LookAheadScatterTimer = 0.0; + SCT->LookAheadScatterMOP = 0.0; + SCT->AssemblyTimer = 0.0; + + SCT->offloadable_flops = 0.0; + SCT->offloadable_mops = 0.0; + +#if 0 + SCT->SchurCompUdtThreadTime = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + SCT->SchurCompUdtThreadTime = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + SCT->SchurCompUdtThreadTime[i] = 0.0; + } + + SCT->schur_flop_counter = 0.0; + SCT->schur_flop_timer = 0.0; + + SCT->datatransfer_timer = 0; + SCT->schurPhiCallTimer = 0; + SCT->schurPhiCallCount = 0; + SCT->datatransfer_count = 0; + SCT->PhiWaitTimer = 0; + SCT->PhiWaitTimer_2 = 0; + SCT->NetSchurUpTimer = 0; + SCT->PhiMemCpyTimer = 0; + SCT->PhiMemCpyCounter = 0; + + SCT->pdgstrs2_timer = 0.0; + SCT->trf2_flops = 0; + SCT->trf2_time = 0; + SCT->CPUOffloadTimer = 0; + SCT->pdgstrf2_timer = 0.0; + SCT->lookaheadupdatetimer = 0; + + /* diagonal block factorization; part 
of pdgstrf2*/ + // SCT->Local_Dgstrf2_tl = 0; + SCT->Local_Dgstrf2_Thread_tl = SCT_ThreadVarInit(num_threads); + /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */ + SCT->Wait_UDiagBlock_Recv_tl = 0; + /*wait for receiving L diagonal block: part of mpf*/ + SCT->Wait_LDiagBlock_Recv_tl = 0; + SCT->Recv_UDiagBlock_tl = 0; + /*wait for previous U block send to finish; part of pdgstrf2 */ + SCT->Wait_UDiagBlockSend_tl = 0; + /*after obtaining U block, time spent in calculating L panel;part of pdgstrf2*/ + SCT->L_PanelUpdate_tl = 0; + /*Synchronous Broadcasting U panel*/ + SCT->Bcast_UPanel_tl = 0; + SCT->Bcast_LPanel_tl = 0; + /*Wait for L send to finish */ + SCT->Wait_LSend_tl = 0; + + /*Wait for U send to finish */ + SCT->Wait_USend_tl = 0; + /*Wait for U receive */ + SCT->Wait_URecv_tl = 0; + /*Wait for L receive */ + SCT->Wait_LRecv_tl = 0; + + /*U_panelupdate*/ + SCT->PDGSTRS2_tl = 0; + + /*profiling by phases*/ + SCT->Phase_Factor_tl = 0; + SCT->Phase_LU_Update_tl = 0; + SCT->Phase_SC_Update_tl = 0; + + /*time to get the lock*/ + SCT->GetAijLock_Thread_tl = SCT_ThreadVarInit(num_threads); + + /*3d timers*/ + SCT->ancsReduce = 0.0; + SCT->gatherLUtimer = 0.0; + + for (int i = 0; i < MAX_3D_LEVEL; ++i) + { + /* code */ + SCT->tFactor3D[i] = 0; + SCT->tSchCompUdt3d[i] = 0; + } + + SCT->tAsyncPipeTail = 0.0; + SCT->tStartup =0.0; + + SCT->commVolFactor =0.0; + SCT->commVolRed =0.0; +} /* SCT_init */ + +void SCT_free(SCT_t* SCT) +{ +#ifdef SCATTER_PROFILE + free(SCT->Host_TheadScatterMOP); + free(SCT->Host_TheadScatterTimer); +#endif +#if 0 + _mm_free(SCT->SchurCompUdtThreadTime); + _mm_free(SCT->Local_Dgstrf2_Thread_tl); + _mm_free(SCT->GetAijLock_Thread_tl); +#else + SUPERLU_FREE(SCT->SchurCompUdtThreadTime); + SUPERLU_FREE(SCT->Local_Dgstrf2_Thread_tl); + SUPERLU_FREE(SCT->GetAijLock_Thread_tl); +#endif + SUPERLU_FREE(SCT); // sherry added +} + + +void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid) +/* +Prints average of the 
value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d) +/* +Prints average of the value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid3d->iam; + int num_procs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrintMarkupHeader(char* headerTitle, double 
value, gridinfo_t* grid) +{ + + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + + if (!iam) + { + printf("#### %s : %10.4f \n\n", headerTitle,sum / num_procs ); + printf("|Function name \t| avg \t| min \t| max \t| std-dev| units|\n"); + printf("|---|---|---|---|---|---|\n"); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} +void DistPrintThreaded(char* function_name, double* value, double Norm, int_t num_threads, char* Units, gridinfo_t* grid) +/* +Prints average of the value across all the MPI ranks, for threaded variables; +First averages over all the threads; +Norm is normalizing constant +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double local_sum = 0; + for (int i = 0; i < num_threads ; ++i) + { + local_sum += value[i * CACHE_LINE_SIZE]; + } + + local_sum = local_sum / (Norm * num_threads); + double sum; + double min = 0; + double max = 0; + double value_squared = local_sum * local_sum; + double sum_value_squared; + + MPI_Reduce( &local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &local_sum, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &local_sum, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%% %s|\n", function_name, + sum / num_procs, min, max, 100 
* num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } +} + + +/*for mkl_get_blocks_frequency*/ +// #include "mkl.h" +void SCT_print(gridinfo_t *grid, SCT_t* SCT) +{ + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } +#endif + CPU_CLOCK_RATE = 1e9 * CPU_CLOCK_RATE; + + int iam = grid->iam; + int_t num_procs = grid->npcol * grid->nprow; + double temp_holder; + MPI_Reduce( &SCT->NetSchurUpTimer, &temp_holder, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + if (!iam) + { + printf("CPU_CLOCK_RATE %.1f\n", CPU_CLOCK_RATE ); + printf("Total time in factorization \t: %5.2lf\n", SCT->pdgstrfTimer); + printf("MPI-communication phase \t: %5.2lf\n", SCT->pdgstrfTimer - (temp_holder / num_procs)); + + } + + /* Printing Panel factorization profile*/ + // double CPU_CLOCK_RATE = 1e9 * mkl_get_clocks_frequency(); + + + // DistPrint("Local_Dgstrf2", SCT->Local_Dgstrf2_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "Local_Dgstrf2 ", SCT->Local_Dgstrf2_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + + // DistPrint("Wait_UDiagBlock_Recv ", SCT->Wait_UDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_LDiagBlock_Recv ", SCT->Wait_LDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Recv_UDiagBlock ", SCT->Recv_UDiagBlock_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_UDiagBlockSend ", SCT->Wait_UDiagBlockSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + + // DistPrint("Bcast_UPanel ", SCT->Bcast_UPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Bcast_LPanel ", SCT->Bcast_LPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_LSend ", SCT->Wait_LSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_USend ", SCT->Wait_USend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_URecv ", SCT->Wait_URecv_tl / CPU_CLOCK_RATE, 
"Seconds", grid); + DistPrint("Wait_LRecv ", SCT->Wait_LRecv_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("L_PanelUpdate ", SCT->L_PanelUpdate_tl , "Seconds", grid); + DistPrint("PDGSTRS2 ", SCT->PDGSTRS2_tl , "Seconds", grid); + + DistPrint("wait-FunCallStream ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("wait-copyStream ", SCT->PhiWaitTimer_2 , "Seconds", grid); + DistPrint("waitGPU2CPU ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("SchurCompUpdate ", SCT->NetSchurUpTimer, "Seconds", grid); + DistPrint("PanelFactorization ", SCT->pdgstrfTimer - SCT->NetSchurUpTimer, "Seconds", grid); + + // DistPrint("Phase_Factor ", SCT->Phase_Factor_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_LU_Update ", SCT->Phase_LU_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_SC_Update ", SCT->Phase_SC_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "GetAijLock ", SCT->GetAijLock_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + double t_total = SCT->tStartup + SCT->pdgstrfTimer + SCT->gatherLUtimer; + DistPrintMarkupHeader("High Level Time Breakdown", t_total, grid); + DistPrint("Startup ", SCT->tStartup, "Seconds", grid); + DistPrint("Main-Factor loop ", SCT->pdgstrfTimer, "Seconds", grid); + DistPrint("3D-GatherLU ", SCT->gatherLUtimer, "Seconds", grid); + DistPrint("tTotal ", t_total, "Seconds", grid); + + DistPrintMarkupHeader("Components of Factor Loop",SCT->pdgstrfTimer, grid); + DistPrint("3D-AncestorReduce ", SCT->ancsReduce, "Seconds", grid); + DistPrint("Pipeline Tail ", SCT->tAsyncPipeTail, "Seconds", grid); + +} + +void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + sprintf( funName, "Grid-%d Factor:Level-%d ", grid3d->zscp.Iam, + (int) maxLvl-1-i); + DistPrint(funName, SCT->tFactor3D[i], "Seconds", grid); + 
// sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + double tsum; + MPI_Reduce( &SCT->tFactor3D[i], &tsum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm ); + + double tmax; + MPI_Reduce( &SCT->tFactor3D[i], &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->zscp.comm ); + + double tavg = tsum /(grid3d->zscp.Np>>i); + double lLmb = 100*(tmax-tavg)/tavg; + sprintf( funName, "Imbalance Factor:Level-%d ", (int) maxLvl-1-i); + if(!grid3d->zscp.Iam) + DistPrint(funName, lLmb, "Seconds", grid); + // sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + // + double cvolFactor; + MPI_Reduce( &SCT->commVolFactor, &cvolFactor, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double cvolRed; + MPI_Reduce( &SCT->commVolRed, &cvolRed, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + + int_t Np = (grid3d->npcol) * (grid3d->nprow) * (grid3d->npdep); + if (!grid3d->iam) + { + /* code */ + printf("| commVolRed | %g | %g |\n", cvolRed, cvolRed/Np ); + printf("| commVolFactor | %g | %g |\n", cvolFactor, cvolFactor/Np ); + } + +} + +int +get_acc_offload () +{ + char *ttemp; + ttemp = getenv ("SUPERLU_ACC_OFFLOAD"); + + if (ttemp) + return atoi (ttemp); + else + return 0; +} + + +void Free_HyP(HyP_t* HyP) +{ +#if 0 + _mm_free(HyP->lookAhead_info ); + _mm_free(HyP->Remain_info ); + 
_mm_free(HyP->lookAhead_L_buff ); + _mm_free(HyP->Remain_L_buff ); + _mm_free(HyP->Ublock_info ); + _mm_free(HyP->Ublock_info_Phi ); + _mm_free(HyP->Lblock_dirty_bit ); + _mm_free(HyP->Ublock_dirty_bit ); +#else + SUPERLU_FREE(HyP->lookAhead_info ); + SUPERLU_FREE(HyP->Remain_info ); + SUPERLU_FREE(HyP->lookAhead_L_buff ); + SUPERLU_FREE(HyP->Remain_L_buff ); + SUPERLU_FREE(HyP->Ublock_info ); + SUPERLU_FREE(HyP->Ublock_info_Phi ); + SUPERLU_FREE(HyP->Lblock_dirty_bit ); + SUPERLU_FREE(HyP->Ublock_dirty_bit ); +#endif + SUPERLU_FREE(HyP); +} + +int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid) +{ + for (int_t i = 0; i < HyP->RemainBlk; ++i) + { + int_t lib = LBi( HyP->Remain_info[i].ib, grid) ; + HyP->Ublock_dirty_bit[lib] = k0; + } + + + for (int_t j = 0; j < HyP->jj_cpu; ++j) + { + int_t ljb = LBj( HyP->Ublock_info_Phi[j].jb, grid) ; + HyP->Lblock_dirty_bit[ljb] = k0; + } + return 0; +} + +int_t scuStatUpdate( + int_t knsupc, + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ) +{ + int_t Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow; + int_t Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow; + int_t nbrow = Lnbrow + Rnbrow; + int_t ncols_host = HyP->num_u_blks == 0 ? 0 : HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + int_t ncols_Phi = HyP->num_u_blks_Phi == 0 ? 
0 : HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + int_t ncols = ncols_Phi+ ncols_host; + // int_t ncols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols + // + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; // ### + SCT->LookAheadRowSepMOP += 2 * (double)knsupc * (double)(nbrow); + SCT->GatherMOP += 2 * (double)HyP->ldu * (double)ncols; + + + SCT->LookAheadGEMMFlOp += 2 * ((double)Lnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Lnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->LookAheadScatterMOP += 3 * Lnbrow * ncols; + SCT->schur_flop_counter += 2 * ((double)Rnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Rnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->scatter_mem_op_counter += 3 * Rnbrow * ncols; + stat->ops[FACT] += 2 * ((double)(Rnbrow + Lnbrow) * (double)HyP->ldu * (double)ncols_host + + (double)(Rnbrow + Lnbrow) * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + + return 0; + +} diff --git a/SRC/sgather.c b/SRC/sgather.c new file mode 100644 index 00000000..7bbc5498 --- /dev/null +++ b/SRC/sgather.c @@ -0,0 +1,398 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Various gather routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include 
+#include "superlu_sdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t sprintMatrix(char*s, int n, int m, float* A, int LDA)
+{
+    printf("%s\n", s );
+    for(int i=0; ixsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    sgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    sgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // sprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // sprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+void sRgather_U( int_t k, int_t jj0, int_t *usub,	float *uval,
+                 float *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;
+    HyP->ldu = 0;
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /*here goes the condition wether jb block exists on Phi or not*/
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first colum on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on  on ncols*/
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    sgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    sgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* sRgather_U */
diff --git a/SRC/sgsequ_dist.c b/SRC/sgsequ_dist.c
new file mode 100644
index 00000000..a133fe1c
--- /dev/null
+++ b/SRC/sgsequ_dist.c
@@ -0,0 +1,204 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file sgsequ_dist.c
+ * \brief Computes row and column scalings
+ *
+ * 
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from LAPACK routine SGEEQU
+ * 
+ */ +/* + * History: Modified from LAPACK routine SGEEQU + */ +#include +#include "superlu_sdefs.h" + + + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   SGSEQU_DIST computes row and column scalings intended to equilibrate an   
+ *   M-by-N sparse matrix A and reduce its condition number. R returns the row
+ *   scale factors and C the column scale factors, chosen to try to make   
+ *   the largest element in each row and column of the matrix B with   
+ *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+ *
+ *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+ *   number and BIGNUM = largest safe number.  Use of these scaling   
+ *   factors is not guaranteed to reduce the condition number of A but   
+ *   works well in practice.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input) SuperMatrix*
+ *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
+ *           factors are to be computed. The type of A can be:
+ *           Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE.
+ *	    
+ *   R       (output) float*, size A->nrow
+ *           If INFO = 0 or INFO > M, R contains the row scale factors   
+ *           for A.
+ *	    
+ *   C       (output) float*, size A->ncol
+ *           If INFO = 0,  C contains the column scale factors for A.
+ *	    
+ *   ROWCND  (output) float*
+ *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+ *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+ *           AMAX is neither too large nor too small, it is not worth   
+ *           scaling by R.
+ *	    
+ *   COLCND  (output) float*
+ *           If INFO = 0, COLCND contains the ratio of the smallest   
+ *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+ *           worth scaling by C.
+ *	    
+ *   AMAX    (output) float*
+ *           Absolute value of largest matrix element.  If AMAX is very   
+ *           close to overflow or very close to underflow, the matrix   
+ *           should be scaled.
+ *	    
+ *   INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *           > 0:  if INFO = i,  and i is   
+ *                 <= A->nrow:  the i-th row of A is exactly zero   
+ *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
+ *
+ *   ===================================================================== 
+ * 
+ */ +void +sgsequ_dist(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info) +{ + + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float rcmin, rcmax; + float bignum, smlnum; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerr_dist("sgsequ_dist", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Get machine constants. */ + smlnum = smach_dist("S"); /* slamch_("S"); */ + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. 
*/ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* sgsequ_dist */ + + diff --git a/SRC/slangs_dist.c b/SRC/slangs_dist.c new file mode 100644 index 00000000..0d81c5f5 --- /dev/null +++ b/SRC/slangs_dist.c @@ -0,0 +1,130 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slangs_dist.c + * \brief Returns the value of the one norm, the infinity norm, or the element of largest value + * Modified from SuperLU routine SLANGS + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * 
+ */ +/* + * File name: slangs_dist.c + * History: Modified from lapack routine SLANGE + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ * Purpose   
+ *   =======   
+ *
+ *   SLANGS_DIST returns the value of the one norm, or the Frobenius norm, or 
+ *   the infinity norm, or the element of largest absolute value of a 
+ *   real matrix A.   
+ *
+ *   Description   
+ *   ===========   
+ *
+ *   SLANGS_DIST returns the value   
+ *
+ *      SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+ *               (   
+ *               ( norm1(A),         NORM = '1', 'O' or 'o'   
+ *               (   
+ *               ( normI(A),         NORM = 'I' or 'i'   
+ *               (   
+ *               ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+ *
+ *   where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+ *   normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+ *   normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+ *   squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   NORM    (input) CHARACTER*1   
+ *           Specifies the value to be returned in SLANGE as described above.   
+ *   A       (input) SuperMatrix*
+ *           The M by N sparse matrix A. 
+ *
+ *  =====================================================================
+ * 
+ */ + +float slangs_dist(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float value = 0.0, sum; + float *rwork; + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if ( (strncmp(norm, "M", 1)==0 ) ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, fabs( Aval[i]) ); + + } else if (strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } + + } else if (strncmp(norm, "I", 1)==0) { + /* Find normI(A). */ + if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += fabs(Aval[i]); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else + ABORT("Illegal norm specified."); + + return (value); + +} /* slangs_dist */ + diff --git a/SRC/slaqgs_dist.c b/SRC/slaqgs_dist.c new file mode 100644 index 00000000..bef736b5 --- /dev/null +++ b/SRC/slaqgs_dist.c @@ -0,0 +1,154 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slaqgs_dist.c + * \brief Equilibrates a general sparse matrix + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Modified from LAPACK routine SLAQGE
+ * 
+ */ +/* + * File name: slaqgs_dist.c + * History: Modified from LAPACK routine SLAQGE + */ +#include +#include "superlu_sdefs.h" + +/*! \brief + * + *
+ *   Purpose   
+ *   =======   
+ *
+ *   SLAQGS_DIST equilibrates a general sparse M by N matrix A using the row and   
+ *   scaling factors in the vectors R and C.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input/output) SuperMatrix*
+ *           On exit, the equilibrated matrix.  See EQUED for the form of 
+ *           the equilibrated matrix. The type of A can be:
+ *	    Stype = NC; Dtype = SLU_S; Mtype = GE.
+ *	    
+ *   R       (input) float*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *	    
+ *   C       (input) float*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *	    
+ *   ROWCND  (input) float
+ *           Ratio of the smallest R(i) to the largest R(i).
+ *	    
+ *   COLCND  (input) float
+ *           Ratio of the smallest C(i) to the largest C(i).
+ *	    
+ *   AMAX    (input) float
+ *           Absolute value of largest matrix entry.
+ *	    
+ *   EQUED   (output) char*
+ *           Specifies the form of equilibration that was done.   
+ *           = 'N':  No equilibration   
+ *           = 'R':  Row equilibration, i.e., A has been premultiplied by  
+ *                   diag(R).   
+ *           = 'C':  Column equilibration, i.e., A has been postmultiplied  
+ *                   by diag(C).   
+ *           = 'B':  Both row and column equilibration, i.e., A has been
+ *                   replaced by diag(R) * A * diag(C).   
+ *
+ *   Internal Parameters   
+ *   ===================   
+ *
+ *   THRESH is a threshold value used to decide if row or column scaling   
+ *   should be done based on the ratio of the row or column scaling   
+ *   factors.  If ROWCND < THRESH, row scaling is done, and if   
+ *   COLCND < THRESH, column scaling is done.   
+ *
+ *   LARGE and SMALL are threshold values used to decide if row scaling   
+ *   should be done based on the absolute size of the largest matrix   
+ *   element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+ *
+ *   ===================================================================== 
+ * 
+ */ + +void +slaqgs_dist(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float large, small, cj; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + Aval[i] *= cj; + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= r[irow]; + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= cj * r[irow]; + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* slaqgs_dist */ + diff --git a/SRC/sldperm_dist.c b/SRC/sldperm_dist.c new file mode 100644 index 00000000..6178c1b0 --- /dev/null +++ b/SRC/sldperm_dist.c @@ -0,0 +1,175 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! 
@file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ + +#include "superlu_sdefs.h" + +extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], + int_t*, int_t [], int_t*, int_t[], int_t*, double [], + int_t [], int_t []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ *   SLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal
+ *              entries of the permuted matrix are one in absolute value and
+ *              all the off-diagonal entries are less than or equal to one in
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) float*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors.
+ *
+ * v      (output) double*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors.
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
+ * 
+ */ + +int +sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + float nzval[], int_t *perm, float u[], float v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + extern double *doubleMalloc_dist(int_t); + double *nzval_d = doubleMalloc_dist(nnz); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sldperm_dist()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + PrintInt10("colptr", n+1, colptr); + PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_dist(icntl); + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; + + for (i = 0; i < nnz; ++i) nzval_d[i] = nzval[i]; + mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval_d, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. 
The last " IFMT " permutations:\n", n-num); + PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sldperm_dist()"); +#endif + return (info[0]); +} + diff --git a/SRC/slook_ahead_update.c b/SRC/slook_ahead_update.c new file mode 100644 index 00000000..5a6999cd --- /dev/null +++ b/SRC/slook_ahead_update.c @@ -0,0 +1,278 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/************************************************************************/ +/*! @file + * \brief Look-ahead update of the Schur complement. + * + *
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *  September 18, 2017
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ *
+ */
+
+#include <assert.h>  /* assertion doesn't work if NDEBUG is defined */
+
+/* NOTE(review): this file has no function definition -- it is #include'd
+ * into the body of the panel-factorization driver.  Every variable used
+ * below (iukp0, rukp0, usub, lsub, bigU, bigV, grid, stat, ...) is
+ * declared by the including routine; confirm against the includer. */
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+j = jj0 = 0;  /* After the j-loop, jj0 points to the first block in U
+                 outside look-ahead window. */
+
+#if 0
+for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
+
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{
+    float zero = 0.0;
+
+#if 1
+    /* Search is needed because a permutation perm_u is involved for j  */
+    /* Search along the row for the pointers {iukp, rukp} pointing to
+     * block U(k,j).
+     * j    -- current block in look-ahead window, initialized to 0 on entry
+     * iukp -- point to the start of index[] metadata
+     * rukp -- point to the start of nzval[] array
+     * jb   -- block number of block U(k,j), update destination column
+     */
+    arrive_at_ublock(
+		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         	     iukp0, rukp0, usub, perm_u, xsup, grid
+		    );
+#else
+    jb = usub[iukp];
+    ljb = LBj (jb, grid);     /* Local block number of U(k,j). */
+    nsupc = SuperSize(jb);
+    iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
+    j++;
+    jj0++;
+    jj = iukp;
+
+    while (usub[jj] == klst) ++jj; /* Skip zero segments */
+
+    ldu = klst - usub[jj++];
+    ncols = 1;
+
+    /* This loop computes ldu. */
+    for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize > ldu)  ldu = segsize;
+        }
+    }
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+	    iam, k, jb, ldu, ncols, nsupc);
+    ++num_copy;
+#endif
+
+    /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+    tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+    for (jj = iukp; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            lead_zero = ldu - segsize;
+            for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+            tempu += lead_zero;
+            for (i = 0; i < segsize; ++i) {
+                tempu[i] = uval[rukp + i];
+            }
+            rukp += segsize;
+            tempu += segsize;
+        }
+    }
+    tempu = bigU; /* set back to the beginning of the buffer */
+
+    nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
+    // double ttx =SuperLU_timer_();
+
+    int current_b = 0; /* Each thread starts searching from first block.
+                          This records the moving search target.           */
+    lptr = lptr0; /* point to the start of index[] in supernode L(:,k) */
+    luptr = luptr0;
+
+#ifdef _OPENMP
+    /* Sherry -- examine all the shared variables ??
+       'firstprivate' ensures that the private variables are initialized
+       to the values before entering the loop.  */
+#pragma omp parallel for \
+    firstprivate(lptr,luptr,ib,current_b) private(lb) \
+    default(shared) schedule(dynamic)
+#endif
+    for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
+        int temp_nbrow; /* automatic variable is private */
+
+        /* Search for the L block that my thread will work on.
+           No need to search from 0, can continue at the point where
+           it is left from last iteration.
+           Note: Blocks may not be sorted in L. Different thread picks up
+	   different lb.   */
+        for (; current_b < lb; ++current_b) {
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;   /* move to next block */
+            luptr += temp_nbrow;  /* move to next block */
+        }
+
+#ifdef _OPENMP
+        int_t thread_id = omp_get_thread_num ();
+#else
+        int_t thread_id = 0;
+#endif
+        float * tempv = bigV + ldt*ldt*thread_id;
+
+        int *indirect_thread  = indirect + ldt * thread_id;
+        int *indirect2_thread = indirect2 + ldt * thread_id;
+        ib = lsub[lptr];        /* block number of L(i,k) */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+	/* assert (temp_nbrow <= nbrow); */
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+	/*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
+        /* calling gemm */
+	stat->ops[FACT] += 2.0 * (flops_t)temp_nbrow * ldu * ncols;
+#if defined (USE_VENDOR_BLAS)
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	}
+#endif
+        /* Now scattering the output. */
+        if (ib < jb) {    /* A(i,j) is in U. */
+            sscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            sscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread,
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+        ++current_b;         /* Move to next block. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadScatterTimer += tt_end - tt_start;
+	}
+#endif
+    } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
+
+    iukp += nsupc; /* Move to block U(k,j+1) */
+
+    /* =========================================== *
+     * == factorize L(:,j) and send if possible == *
+     * =========================================== */
+    kk = jb; /* destination column that is just updated */
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1];
+#else
+    kk0 = perm_u[2 * (j - 1)];
+#endif
+    look_id = kk0 % (1 + num_look_aheads);
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+        /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads);
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+
+        double tt1 = SuperLU_timer_();
+
+        PSGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+        pdgstrf2_timer += SuperLU_timer_() - tt1;
+
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Multicasts numeric values of L(:,kk) to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) {
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else {
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d, tags %d:%d \n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj,
+			SLU_MPI_TAG(0,kk0), SLU_MPI_TAG(1,kk0));
+#endif
+            }  /* end if ( ToSendR[lk][pj] != EMPTY ) */
+        } /* end for pj ... */
+    } /* end if( look_ahead[kk] == k0 && kcol == mycol ) */
+} /* end while j < nub and perm_u[j] 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_sdefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + float *bigV; + float *bigU; + float *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + float *Remain_L_buff; /* on GPU */ + float *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} sSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + float *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + float *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + sSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + float *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* perm_c_supno; 
+ int_t first_l_block_gpu, first_u_block_gpu; +} sLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + sLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} ssluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int ssparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + sscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + sLUValSubBuf_t **LUvsbs, // size = num Look ahead + sdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + ssluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int sinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + ssluGPU_t *sluGPU, + gridinfo_t *grid, + sLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int sreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + sLUstruct_t *LUstruct); + +extern int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT); +extern int ssendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, ssluGPU_t *sluGPU); +extern int ssendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, float* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, ssluGPU_t *sluGPU, HyP_t* HyP +); + +extern int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, + sLUstruct_t 
*LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void sCopyLUToGPU3D (int* isNodeInMyGrid, sLocalLU_t *A_host, + ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT); +extern int sfree_LUstruct_gpu (sLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(ssluGPU_t *sluGPU); + +extern void sPrint_matrix( char *desc, int_t m, int_t n, float *dA, int_t lda ); + +/*to print out various statistics*/ +void sprintGPUStats(sLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/smemory_dist.c b/SRC/smemory_dist.c new file mode 100644 index 00000000..8c9ef510 --- /dev/null +++ b/SRC/smemory_dist.c @@ -0,0 +1,286 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Memory utilities + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * 
+ */ + +#include "superlu_sdefs.h" + + +/* Variables external to this file */ +extern SuperLU_LU_stack_t stack; + + +void *suser_malloc_dist(int_t bytes, int_t which_end) +{ + void *buf; + + if ( SuperLU_StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) stack.array + stack.top1; + stack.top1 += bytes; + } else { + stack.top2 -= bytes; + buf = (char*) stack.array + stack.top2; + } + + stack.used += bytes; + return buf; +} + + +void suser_free_dist(int_t bytes, int_t which_end) +{ + if ( which_end == HEAD ) { + stack.top1 -= bytes; + } else { + stack.top2 += bytes; + } + stack.used -= bytes; +} + + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      The amount of space needed in bytes to perform factorization.
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization.
+ * 
+ */ +int_t sQuerySpace_dist(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + SuperLUStat_t *stat, superlu_dist_mem_usage_t *mem_usage) +{ + register int_t dword, gb, iword, k, nb, nsupers; + int_t *index, *xsup; + int iam, mycol, myrow; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + iword = sizeof(int_t); + dword = sizeof(float); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + mem_usage->for_lu = 0.; + + /* For L factor */ + nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->npcol + mycol; /* Global block number. */ + if ( gb < nsupers ) { + index = Llu->Lrowind_bc_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float) + ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); + mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); + } + } + } + + /* For U factor */ + nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->nprow + myrow; /* Global block number. 
*/ + if ( gb < nsupers ) { + index = Llu->Ufstnz_br_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float)(index[2] * iword); + mem_usage->for_lu += (float)(index[1] * dword); + } + } + } + + /* Working storage to support factorization */ + mem_usage->total = mem_usage->for_lu; +#if 0 + mem_usage->total += + (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + + ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); + /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ + mem_usage->total += + (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); + mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; + k = CEILING( nsupers, grid->nprow ); + mem_usage->total += (float)(2 * k * iword); +#else + /*mem_usage->total += stat->current_buffer;*/ + mem_usage->total += stat->peak_buffer; + +#if ( PRNTlevel>=1 ) + if (iam==0) printf(".. sQuerySpace: peak_buffer %.2f (MB)\n", + stat->peak_buffer * 1.0e-6); +#endif +#endif + return 0; +} /* sQuerySpace_dist */ + + +/* + * Allocate storage for original matrix A + */ +void +sallocateA_dist(int_t n, int_t nnz, float **a, int_t **asub, int_t **xa) +{ + *a = (float *) floatMalloc_dist(nnz); + *asub = (int_t *) intMalloc_dist(nnz); + *xa = (int_t *) intMalloc_dist(n+1); +} + + +float *floatMalloc_dist(int_t n) +{ + float *buf; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float) ); + return (buf); +} + +float *floatCalloc_dist(int_t n) +{ + float *buf; + register int_t i; + float zero = 0.0; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float)); + if ( !buf ) return (buf); + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + +/*************************************** + * The following are from 3D code. 
+ ***************************************/ + +double sgetLUMem(int_t nodeId, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0.0; + gridinfo_t* grid = &(grid3d->grid2d); + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + // double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( nodeId, grid ); + if (mycol == pc) + { + int_t ljb = LBj( nodeId, grid ); /* Local block number */ + int_t *lsub; + float* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(nodeId) * len; + memlu += 1.0 * (len1 * sizeof(int_t) + len2 * sizeof(float)); + } + } + + int_t pr = PROW( nodeId, grid ); + if (myrow == pr) + { + int_t lib = LBi( nodeId, grid ); /* Local block number */ + int_t *usub; + // double* unzval; + usub = Ufstnz_br_ptr[lib]; + + if (usub != NULL) + { + int_t lenv = usub[1]; + int_t lens = usub[2]; + memlu += 1.0 * (lenv * sizeof(int_t) + lens * sizeof(float)); + } + } + return memlu; +} + +double smemForest(sForest_t*sforest, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0; + + int_t *perm_c_supno = sforest->nodeList; + int_t nnodes = sforest->nNodes; + for (int i = 0; i < nnodes; ++i) + { + memlu += sgetLUMem(perm_c_supno[i], LUstruct, grid3d); + } + + return memlu; +} + +void s3D_printMemUse( trf3Dpartition_t* trf3Dpartition, sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d ) +{ + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + + double memNzLU 
= 0.0; + double memzLU = 0.0; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + { + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + if (sforest) + { + if (!myZeroTrIdxs[ilvl]) + { + memNzLU += smemForest(sforest, LUstruct, grid3d); + } + else + { + memzLU += smemForest(sforest, LUstruct, grid3d); + } + } + } + double sumMem = memNzLU + memzLU; + double maxMem, minMem, avgNzLU, avgzLU; + /*Now reduce it among all the procs*/ + MPI_Reduce(&sumMem, &maxMem, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm); + MPI_Reduce(&sumMem, &minMem, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm); + MPI_Reduce(&memNzLU, &avgNzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + MPI_Reduce(&memzLU, &avgzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + + int_t nProcs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + if (!(grid3d->iam)) + { + /* code */ + printf("| Total Memory \t| %.2g \t| %.2g \t|%.2g \t|\n", (avgNzLU + avgzLU) / nProcs, maxMem, minMem ); + printf("| LU-LU(repli) \t| %.2g \t| %.2g \t|\n", (avgNzLU) / nProcs, avgzLU / nProcs ); + } +} + diff --git a/SRC/smyblas2_dist.c b/SRC/smyblas2_dist.c new file mode 100644 index 00000000..34281a54 --- /dev/null +++ b/SRC/smyblas2_dist.c @@ -0,0 +1,248 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Level 2 BLAS operations: solves and matvec, written in C + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/* + * File name: smyblas2.c + * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + */ + +/*! \brief + * + *
+ * Solves a dense UNIT lower triangular system. The unit lower
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol).
+ * The solution will be returned in the rhs vector.
+ * 
+ */ +void slsolve ( int ldm, int ncol, float *M, float *rhs ) +{ + int k; + float x0, x1, x2, x3, x4, x5, x6, x7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * 
*Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + return; +} + +/*! \brief + * + *
+ * Solves a dense upper triangular system. The upper triangular matrix is
+ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
+ * in the rhs vector.
+ * 
+ */ +void +susolve ( + int ldm, /* in */ + int ncol, /* in */ + float *M, /* in */ + float *rhs /* modified */ +) +{ + float xj; + int jcol, j, irow; + + jcol = ncol - 1; + + for (j = 0; j < ncol; j++) { + + xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ + rhs[jcol] = xj; + + for (irow = 0; irow < jcol; irow++) + rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ + + jcol--; + + } + return; +} + + +/*! \brief + * + *
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
+ * 
+ */ +void smatvec ( + int ldm, /* in -- leading dimension of M */ + int nrow, /* in */ + int ncol, /* in */ + float *M, /* in */ + float *vec, /* in */ + float *Mxvec /* in/out */ +) +{ + float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + return; +} + diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c new file mode 100644 index 00000000..5140cc24 --- /dev/null +++ b/SRC/snrformat_loc3d.c @@ -0,0 +1,575 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, float *Dst, int lddst, float *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void sGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ float *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = floatMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (float *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	
+	/* Btmp <- compact(B), compacting B */
+	float *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	float *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = floatMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = floatMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_FLOAT, B1,
+		    b_counts_int, b_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((float*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* sGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int sScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    float *B = (float *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    float *B2d = (float *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    float *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = floatMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    float *Btmp; // on 3D grid
+    Btmp = floatMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_FLOAT,
+		     Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking number procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_FLOAT,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* sScatter_B3d */
diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c
index 94db174a..ca97bc36 100644
--- a/SRC/sp_colorder.c
+++ b/SRC/sp_colorder.c
@@ -223,13 +223,13 @@ sp_colorder(superlu_dist_options_t *options,  SuperMatrix *A, int_t *perm_c,
 int
 check_perm_dist(char *what, int_t n, int_t *perm)
 {
-    register int_t i;
+    register int i;
     int_t          *marker;
     marker = (int_t *) intCalloc_dist(n);
 
     for (i = 0; i < n; ++i) {
 	if ( perm[i] >= n || marker[perm[i]] == 1 ) {
-	    printf("%s: Not a valid PERM[" IFMT "] = " IFMT "\n", 
+	    printf("%s: Not a valid PERM[%d] = " IFMT "\n", 
 		   what, i, perm[i]);
 	    ABORT("check_perm_dist");
 	} else {
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 08d1e8f1..e7ea44db 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -53,6 +53,10 @@ at the top-level directory.
 	         of L and U, compared with A;
 	    = 7: the minimum value of the product M*N*K for a GEMM call
 	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+            = 8: the maximum buffer size on GPU that can hold the "dC"
+	         matrix in the GEMM call for the Schur complement update.
+		 If this is too small, the Schur complement update will be
+		 done in multiple partitions, may be slower.
 	    
    (SP_IENV_DIST) (output) int
             >= 0: the value of the parameter specified by ISPEC   
@@ -62,13 +66,11 @@ at the top-level directory.
 
*/ - #include #include - -int_t -sp_ienv_dist(int_t ispec) +int +sp_ienv_dist(int ispec) { // printf(" this function called\n"); int i; @@ -91,16 +93,16 @@ sp_ienv_dist(int_t ispec) return(atoi(ttemp)); } else - return 20; + return 60; // 20 case 3: - ttemp = getenv("NSUP"); + ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h if(ttemp) { - return(atoi(ttemp)); + int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE ); + return (k); } - else - return 128; + else return 256; // 128; #endif case 6: @@ -110,8 +112,11 @@ sp_ienv_dist(int_t ispec) case 7: ttemp = getenv ("N_GEMM"); if (ttemp) return atoi (ttemp); - else return 10000; - + else return 100; // 10000; + case 8: + ttemp = getenv ("MAX_BUFFER_SIZE"); + if (ttemp) return atoi (ttemp); + else return 256000000; // 256000000 = 16000^2 } /* Invalid value for ISPEC */ @@ -119,6 +124,5 @@ sp_ienv_dist(int_t ispec) xerr_dist("sp_ienv", &i); return 0; - } /* sp_ienv_dist */ diff --git a/SRC/sreadMM.c b/SRC/sreadMM.c new file mode 100644 index 00000000..d2ac8c8e --- /dev/null +++ b/SRC/sreadMM.c @@ -0,0 +1,244 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief + * Contributed by Francois-Henry Rouet. + * + */ +#include +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t j, k, jsize, nnz, nz, new_nonz; + float *a, *val; + int_t *asub, *xa, *row, *col; + int_t zero_base = 0; + char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64]; + int expand; + char *cs; + + /* File format: + * %%MatrixMarket matrix coordinate real general/symmetric/... + * % ... + * % (optional comments) + * % ... + * #rows #non-zero + * Triplet in the rest of lines: row col value + */ + + /* 1/ read header */ + cs = fgets(line,512,fp); + for (p=line; *p!='\0'; *p=tolower(*p),p++); + + if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) { + printf("Invalid header (first line does not contain 5 tokens)\n"); + exit(-1); + } + + if(strcmp(banner,"%%matrixmarket")) { + printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n"); + exit(-1); + } + + if(strcmp(mtx,"matrix")) { + printf("Not a matrix; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(crd,"coordinate")) { + printf("Not in coordinate format; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(arith,"real")) { + if(!strcmp(arith,"complex")) { + printf("Complex matrix; use zreadMM instead!\n"); + exit(-1); + } + else if(!strcmp(arith, "pattern")) { + printf("Pattern matrix; values are needed!\n"); + exit(-1); + } + else { + printf("Unknown arithmetic\n"); + exit(-1); + } + } + + if(strcmp(sym,"general")) { + printf("Symmetric matrix: will be expanded\n"); + expand=1; + } else + expand=0; + + /* 2/ Skip comments */ + while(banner[0]=='%') { + cs = fgets(line,512,fp); + sscanf(line,"%s",banner); + } + + /* 3/ Read n and nnz */ +#ifdef _LONGINT + sscanf(line, "%lld%lld%lld", m, n, nonz); +#else + sscanf(line, "%d%d%d",m, n, nonz); +#endif + + if(*m!=*n) { + printf("Rectangular matrix!. 
Abort\n"); + exit(-1); + } + + if(expand) + new_nonz = 2 * *nonz - *n; + else + new_nonz = *nonz; + + *m = *n; + printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz); + fflush(stdout); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = floatMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* 4/ Read triplets of values */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + + j = fscanf(fp, IFMT IFMT "%f\n", &row[nz], &col[nz], &val[nz]); + + if ( nnz == 0 ) /* first nonzero */ { + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else + printf("triplet file: row/col indices are one-based.\n"); + fflush(stdout); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. 
*/ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; + if(expand) { + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } + } + ++nz; + } + } + + *nonz = nz; + if(expand) { + printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz); + fflush(stdout); + } + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + + +static void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "sreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + i = fscanf(fp, "%lf\n", &b[i]); + /*fscanf(fp, "%d%lf\n", &j, &b[i]);*/ + /* readpair_(j, &b[i]);*/ + fclose(fp); +} diff --git a/SRC/sreadhb.c b/SRC/sreadhb.c new file mode 100644 index 00000000..4476af93 --- /dev/null +++ b/SRC/sreadhb.c @@ -0,0 +1,389 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read a FLOAT PRECISION matrix stored in Harwell-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ +#include +#include +#include +#include "superlu_sdefs.h" + +/* + * Prototypes + */ +static void ReadVector(FILE *, int_t, int_t *, int_t, int_t); +static void sReadValues(FILE *, int_t, float *, int_t, int_t); +static void FormFullA(int_t, int_t *, float **, int_t **, int_t **); +static int DumpLine(FILE *); +static int ParseIntFormat(char *, int_t *, int_t *); +static int ParseFloatFormat(char *, int_t *, int_t *); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Read a FLOAT PRECISION matrix stored in Harwell-Boeing format
+ * as described below.
+ *
+ * Line 1 (A72,A8)
+ *  	Col. 1 - 72   Title (TITLE)
+ *	Col. 73 - 80  Key (KEY)
+ *
+ * Line 2 (5I14)
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD)
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD)
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD)
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD)
+ *                    (including starting guesses and solution vectors
+ *		       if present)
+ *           	      (zero indicates no right-hand side data is present)
+ *
+ * Line 3 (A3, 11X, 4I14)
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW)
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL)
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO)
+ *	              (equal to number of entries for assembled matrices)
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL)
+ *	              (zero in the case of assembled matrices)
+ * Line 4 (2A16, 2A20)
+ * 	Col. 1 - 16   Format for pointers (PTRFMT)
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT)
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT)
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT)
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present
+ *    	Col. 1 	      Right-hand side type:
+ *	         	  F for full storage or M for same format as matrix
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP)
+ *    	Col. 3        X if an exact solution vector(s) is supplied.
+ *	Col. 15 - 28  Number of right-hand sides (NRHS)
+ *	Col. 29 - 42  Number of row indices (NRHSIX)
+ *          	      (ignored in case of unassembled matrices)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *	R Real matrix
+ *	C Complex matrix
+ *	P Pattern only (no numerical values supplied)
+ *
+ * Second Character:
+ *	S Symmetric
+ *	U Unsymmetric
+ *	H Hermitian
+ *	Z Skew symmetric
+ *	R Rectangular
+ *
+ * Third Character:
+ *	A Assembled
+ *	E Elemental matrices (unassembled)
+ * 
+ */ + +void +sreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + + register int_t i, numer_lines, rhscrd = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int_t sym; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sreadhb_dist()"); +#endif + + /* Line 1 */ + fgets(buf, 100, fp); + + /* Line 2 */ + for (i=0; i<5; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + if (i == 4 && tmp) rhscrd = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); + + /* Line 4: format statement */ + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &colnum, &colsize); + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &rownum, &rowsize); + fscanf(fp, "%20c", buf); + ParseFloatFormat(buf, &valnum, &valsize); + fscanf(fp, "%20c", buf); + DumpLine(fp); + + /* Line 5: right-hand side */ + if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */ + +#if ( DEBUGlevel>=1 ) + if ( !iam ) { + printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz); + printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize); + printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize); + printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize); + } +#endif + + ReadVector(fp, *ncol+1, *colptr, colnum, 
colsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]); +#endif + ReadVector(fp, *nonz, *rowind, rownum, rowsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read rowind[" IFMT "] = " IFMT "\n", *nonz-1, (*rowind)[*nonz-1]); +#endif + if ( numer_lines ) { + sReadValues(fp, *nonz, *nzval, valnum, valsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read nzval[" IFMT "] = %e\n", *nonz-1, (*nzval)[*nonz-1]); +#endif + } + + sym = (type[1] == 'S' || type[1] == 's'); + if ( sym ) { + FormFullA(*ncol, nonz, nzval, rowind, colptr); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sreadhb_dist()"); +#endif +} + +/* Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' 
&& *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); + + return 0; +} + +static void +ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 
) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + } + + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 ) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} diff --git a/SRC/sreadrb.c b/SRC/sreadrb.c new file mode 100644 index 00000000..7165536e --- /dev/null +++ b/SRC/sreadrb.c @@ -0,0 +1,346 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file sreadrb.c + * \brief Read a matrix stored in Rutherford-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * 
+ * + * Purpose + * ======= + * + * Read a FLOAT PRECISION matrix stored in Rutherford-Boeing format + * as described below. + * + * Line 1 (A72, A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Matrix name / identifier (MTRXID) + * + * Line 2 (I14, 3(1X, I13)) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 16 - 28 Number of lines for pointers (PTRCRD) + * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 44 - 56 Number of lines for numerical values (VALCRD) + * + * Line 3 (A3, 11X, 4(1X, I13)) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Compressed Column: Number of rows (NROW) + * Elemental: Largest integer used to index variable (MVAR) + * Col. 30 - 42 Compressed Column: Number of columns (NCOL) + * Elemental: Number of element matrices (NELT) + * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) + * Elemental: Number of variable indeces (NVARIX) + * Col. 58 - 70 Compressed Column: Unused, explicitly zero + * Elemental: Number of elemental matrix entries (NELTVL) + * + * Line 4 (2A16, A20) + * Col. 1 - 16 Fortran format for pointers (PTRFMT) + * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Fortran format for numerical values of coefficient matrix + * (VALFMT) + * (blank in the case of matrix patterns) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. 
+ * + * First Character: + * R Real matrix + * C Complex matrix + * I integer matrix + * P Pattern only (no numerical values supplied) + * Q Pattern only (numerical values supplied in associated auxiliary value + * file) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Compressed column form + * E Elemental form + * + *
+ */ + +#include +#include +#include "superlu_sdefs.h" + +/*! \brief Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; + ++k; + } + } + + for (i = al_colptr[j]; i < 
al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} + +void +sreadrb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, numer_lines = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int sym; + + /* Line 1 */ + fgets(buf, 100, fp); + fputs(buf, stdout); + + /* Line 2 */ + for (i=0; i<4; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if (DEBUGlevel >= 1) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); + + /* Line 4: format statement */ + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &colnum, &colsize); + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &rownum, &rowsize); + fscanf(fp, "%20c", buf); + ParseFloatFormat(buf, &valnum, &valsize); + DumpLine(fp); + +#if (DEBUGlevel >= 1) + if ( !iam ) { + printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz); + printf("colnum " 
IFMT ", colsize " IFMT "\n", colnum, colsize); + printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize); + printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize); + } +#endif + + ReadVector(fp, *ncol+1, *colptr, colnum, colsize); + ReadVector(fp, *nonz, *rowind, rownum, rowsize); + if ( numer_lines ) { + sReadValues(fp, *nonz, *nzval, valnum, valsize); + } + + sym = (type[1] == 'S' || type[1] == 's'); + if ( sym ) { + FormFullA(*ncol, nonz, nzval, rowind, colptr); + } + +} diff --git a/SRC/sreadtriple.c b/SRC/sreadtriple.c new file mode 100644 index 00000000..7e10f4ac --- /dev/null +++ b/SRC/sreadtriple.c @@ -0,0 +1,181 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief + * + */ +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t j, k, jsize, nnz, nz, new_nonz; + float *a, *val; + int_t *asub, *xa, *row, *col; + int_t zero_base = 0; + + /* File format: + * First line: #rows #non-zero + * Triplet in the rest of lines: + * row col value + */ + +#ifdef _LONGINT + fscanf(fp, "%lld%lld%lld", m, n, nonz); +#else + fscanf(fp, "%d%d%d", m, n, nonz); +#endif + +#ifdef EXPAND_SYM + new_nonz = 2 * *nonz - *n; +#else + new_nonz = *nonz; +#endif + *m = *n; + printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + +#ifdef _LONGINT + fscanf(fp, "%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); +#else // int + fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); +#endif + + if ( nnz == 0 ) /* first nonzero */ + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else { + printf("triplet file: row/col indices are one-based.\n"); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. 
*/ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; +#ifdef EXPAND_SYM + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } +#endif + ++nz; + } + } + + *nonz = nz; +#ifdef EXPAND_SYM + printf("new_nonz after symmetric expansion:\t%d\n", *nonz); +#endif + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + + +void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "sreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%f\n", &b[i]); + /* readpair_(j, &b[i]);*/ + + fclose(fp); +} + + diff --git a/SRC/sreadtriple_noheader.c b/SRC/sreadtriple_noheader.c new file mode 100644 index 00000000..32905984 --- /dev/null +++ b/SRC/sreadtriple_noheader.c @@ -0,0 +1,199 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. 
of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief + * + */ +#include +#include "superlu_sdefs.h" + +#undef EXPAND_SYM + +/*! brief + * + *
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * 
+ */ + +void +sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t i, j, k, jsize, lasta, nnz, nz, new_nonz, minn = 100; + float *a, *val, vali; + int_t *asub, *xa, *row, *col; + int zero_base = 0, ret_val = 0; + + /* File format: Triplet in a line for each nonzero entry: + * row col value + * or row col real_part imaginary_part + */ + + /* First pass: determine N and NNZ */ + nz = *n = 0; + +#ifdef _LONGINT + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); +#else // int + ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); +#endif + + while (ret_val != EOF) { + *n = SUPERLU_MAX(*n, i); + *n = SUPERLU_MAX(*n, j); + minn = SUPERLU_MIN(minn, i); + minn = SUPERLU_MIN(minn, j); + ++nz; + +#ifdef _LONGINT + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); +#else // int + ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); +#endif + } + + if ( minn == 0 ) { /* zero-based indexing */ + zero_base = 1; + ++(*n); + printf("triplet file: row/col indices are zero-based.\n"); + } else { + printf("triplet file: row/col indices are one-based.\n"); + } + + *m = *n; + *nonz = nz; + rewind(fp); + +#ifdef EXPAND_SYM + new_nonz = 2 * *nonz - *n; +#else + new_nonz = *nonz; +#endif + + /* Second pass: read the actual matrix values */ + printf("m %ld, n %ld, nonz %ld\n", (long int) *m, (long int) *n, (long int) *nonz); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* Read into the triplet array from a file */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { +#ifdef _LONGINT + fscanf(fp, 
"%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); +#else // int32 + fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); +#endif + + if ( !zero_base ) { + /* Change to 0-based indexing. */ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz" IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; +#ifdef EXPAND_SYM + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } +#endif + ++nz; + } + } + + *nonz = nz; +#ifdef EXPAND_SYM + printf("new_nonz after symmetric expansion:\t%d\n", *nonz); +#endif + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + +#if 0 +void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i, j; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "zreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + fscanf(fp, "%f\n", &b[i]); + + fclose(fp); +} +#endif + diff --git a/SRC/sscatter.c b/SRC/sscatter.c new file mode 100644 index 00000000..3fc90962 --- /dev/null +++ b/SRC/sscatter.c @@ -0,0 +1,524 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Scatter the computed blocks into LU destination. + * + *
+ * -- Distributed SuperLU routine (version 6.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 18, 2017, enable SIMD vectorized scatter operation.
+ *
+ */
+#include <math.h>  /* NOTE(review): header name was lost (angle brackets stripped by markup); upstream sscatter.c includes <math.h> -- confirm */
+#include "superlu_sdefs.h"
+
+/*! \brief Scatter a dense GEMM update block into destination block L(i,j).
+ *
+ * Version 1: builds the indirect row-permutation table inline because row
+ * indices inside an L block are not sorted.  For every nonzero segment
+ * column of U(k,:), one column of tempv[] (leading dimension nbrow) is
+ * subtracted from the destination values in Lnzval_bc_ptr[ljb].
+ *
+ * NOTE(review): assumes indirect_thread[] has capacity >= the destination
+ * supernode's row count -- confirm against caller's allocation (ldt).
+ */
+void
+sscatter_l_1 (int ib,
+           int ljb,
+           int nsupc,
+           int_t iukp,
+           int_t* xsup,
+           int klst,
+           int nbrow,
+           int_t lptr,
+           int temp_nbrow,
+           int * usub,
+           int * lsub,
+           float *tempv,
+           int * indirect_thread,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+	   gridinfo_t * grid)
+{
+    // TAU_STATIC_TIMER_START("SCATTER_LB");
+    // printf("hello\n");
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+    while (ijb != ib)
+    {
+        /* Search for dest block --
+           blocks are not ordered! */
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+
+        ijb = index[lptrj];
+    }
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    lptrj += LB_DESCRIPTOR;
+    /* index[lptrj - 1] is the number of rows in the destination block. */
+    for (i = 0; i < index[lptrj - 1]; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    // tempv =bigV + (cum_nrow + cum_ncol*nbrow);
+    /* Loop over the nsupc columns of the destination supernode; skip
+       columns whose U(k,j) segment is empty (segsize == 0). */
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        // printf("segsize %d \n",segsize);
+        if (segsize) {
+            /*#pragma _CRI cache_bypass nzval,tempv */
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                nzval[indirect_thread[rel]] -= tempv[i];
+                // printf("i (src) %d, perm (dest) %d  \n",i,indirect_thread[rel]);
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                // if(!(*(long*)&zz == *(long*)&tempv[i]) )
+                printf ("(%d %d, %0.3e, %0.3e, %3e ) ", ljb,
+                        nzval - Lnzval_bc_ptr[ljb] + indirect_thread[rel],
+                        nzval[indirect_thread[rel]] + tempv[i],
+                        nzval[indirect_thread[rel]],tempv[i]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            }
+            // printf("\n");
+            tempv += nbrow;
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }
+        nzval += ldv;
+        // printf("%d\n",nzval );
+    }
+    // TAU_STATIC_TIMER_STOP("SCATTER_LB");
+} /* sscatter_l_1 */
+
+/*! \brief Scatter a dense GEMM update block into destination block L(i,j).
+ *
+ * Row indices inside an L block are not sorted, so a two-step indirect
+ * permutation is built first (indirect_thread maps relative row -> dest
+ * position, indirect2 maps source row i -> dest position); the inner
+ * update loop is then a plain gather/subtract that can be vectorized
+ * with OpenMP SIMD.
+ */
+void
+sscatter_l (
+           int ib,    /* row block number of source block L(i,k) */
+           int ljb,   /* local column block number of dest. block L(i,j) */
+           int nsupc, /* number of columns in destination supernode */
+           int_t iukp, /* point to destination supernode's index[] */
+           int_t* xsup,
+           int klst,
+           int nbrow,  /* LDA of the block in tempv[] */
+           int_t lptr, /* Input, point to index[] location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* usub,
+           int_t* lsub,
+           float *tempv,
+           int* indirect_thread,int* indirect2,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+           gridinfo_t * grid)
+{
+
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the destination lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)  /* Search for destination block L(i,j) */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+    /*
+     * Build indirect table. This is needed because the indices are not sorted
+     * in the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow=index[lptrj - 1];
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    for (i = 0; i < dest_nbrow; ++i) {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    /* can be precalculated? */
+    for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] =indirect_thread[rel];
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+    /* For each column jj of the destination supernode, subtract one column
+       of tempv[] (stride nbrow); empty segments (segsize == 0) are skipped
+       and do not consume a tempv column. */
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* sscatter_l */
+
+
+/*! \brief Scatter a dense GEMM update block into destination block U(i,j).
+ *
+ * Walks local row block LBi(ib) of U to locate destination block jb, then
+ * for each nonzero segment of U(k,j) subtracts one column of tempv[]
+ * (leading dimension nbrow) from ucol[].  Destination rows are addressed
+ * relative to the segment's first-nonzero index fnz, so no indirect
+ * permutation table is needed here.
+ */
+void
+sscatter_u (int ib,
+           int jb,
+           int nsupc,
+           int_t iukp,
+           int_t * xsup,
+           int klst,
+ 	   int nbrow,      /* LDA of the block in tempv[] */
+           int_t lptr,     /* point to index location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* lsub,
+           int_t* usub,
+           float* tempv,
+           int_t ** Ufstnz_br_ptr, float **Unzval_br_ptr,
+           gridinfo_t * grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    // TAU_STATIC_TIMER_START("SCATTER_U");
+    // TAU_STATIC_TIMER_START("SCATTER_UB");
+
+    int_t jj, i, fnz, rel;
+    int segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* Reinitilize the pointers to the beginning of the k-th column/row of
+     * L/U factors.
+     * usub[] - index array for panel U(k,:)
+     */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    /* Blocks in a U row are ordered by column block number, so a linear
+       scan with "< jb" finds the destination block. */
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb) {   /* Search for destination block. */
+        ruip_lib += index[iuip_lib + 1];
+        // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor. Now point to fstnz index of block U(i,j). */
+    iuip_lib += UB_DESCRIPTOR;
+
+    // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
+    /* fnz is read every iteration (also used below to advance ruip_lib
+       even when the segment is empty). */
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        if (segsize) {          /* Nonzero segment in U(k,j). */
+            ucol = &Unzval_br_ptr[lib][ruip_lib];
+
+            // printf("========Entering loop=========\n");
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
+                // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
+                ucol[rel] -= tempv[i];
+
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                if (!(*(long *) &zz == *(long *) &tempv[i]))
+                    printf ("(%d, %0.3e, %0.3e ) ", rel, ucol[rel] + tempv[i],
+                            ucol[rel]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            } /* for i = 0:temp_nbropw */
+            tempv += nbrow; /* Jump LDA to next column */
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }  /* if segsize */
+
+        ruip_lib += ilst - fnz;
+
+    }  /* for jj = 0:nsupc */
+#ifdef PI_DEBUG
+    // printf("\n");
+#endif
+    // TAU_STATIC_TIMER_STOP("SCATTER_UB");
+} /* sscatter_u */
+
+
+/*Divide CPU-GPU dgemm work here*/
+#ifdef PI_DEBUG
+int Ngem = 2;
+// int_t Ngem = 0;
+int min_gpu_col = 6;
+#else
+
+    // int_t Ngem = 0;
+
+#endif
+
+
+#if 0 // Sherry: moved and corrected in util.c 
+#ifdef GPU_ACC
+
+void
+gemm_division_cpu_gpu(
+    int* num_streams_used,  /*number of streams that will be used */
+    int* stream_end_col,    /*array holding last column blk for each partition */
+    int * ncpu_blks,        /*Number of CPU dgemm blks */
+    /*input */
+    int nbrow,              /*number of row in A matrix */
+    int ldu,                /*number of k in dgemm */
+    int nstreams,
+    int* full_u_cols,       /*array containing prefix sum of work load */
+    int num_blks            /*Number of work load */
+)
+{
+    int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_gpublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2
+     */
+    int i, j;
+
+    // {
+    //     *num_streams_used=0;
+    //     *ncpu_blks = num_blks;
+    //     return;
+    // }
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (full_u_cols[num_blks - 1] < (Ngem / (nbrow * ldu)) || num_blks == 1 )
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+#ifdef PI_DEBUG
+        printf ("full_u_cols[num_blks-1] %d  %d \n",
+                full_u_cols[num_blks - 1], (Ngem / (nbrow * ldu)));
+        printf ("Early return \n");
+#endif
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (full_u_cols[i + 1] > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+        full_u_cols[num_blks - 1] - full_u_cols[*ncpu_blks - 1];
+
+#ifdef PI_DEBUG
+    printf ("Remaining cols %d num_blks %d cpu_blks %d \n", cols_remain,
+            num_blks, *ncpu_blks);
+#endif
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+#ifdef PI_DEBUG
+        printf ("%d %d  %d %d \n", full_u_cols[num_blks - 1],
+                full_u_cols[*ncpu_blks], *ncpu_blks, nstreams);
+#endif
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+#ifdef PI_DEBUG
+        printf ("cols_per_stream :\t%d\n", cols_per_stream);
+#endif
+
+        int_t cutoff = cols_per_stream + full_u_cols[*ncpu_blks - 1];
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+#ifdef PI_DEBUG
+                printf ("i %d, j %d, %d  %d ", i, j, full_u_cols[j + 1],
+                        cutoff);
+#endif
+                if (full_u_cols[j + 1] > cutoff)
+                {
+#ifdef PI_DEBUG
+                    printf ("cutoff met \n");
+#endif
+                    cutoff = cols_per_stream + full_u_cols[j];
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+#ifdef PI_DEBUG
+                printf ("\n");
+#endif
+            }
+
+        }
+
+    }
+}
+
+void
+gemm_division_new (int * num_streams_used,   /*number of streams that will be used */
+                   int * stream_end_col, /*array holding last column blk for each partition */
+                   int * ncpu_blks,  /*Number of CPU dgemm blks */
+                        /*input */
+                   int nbrow,    /*number of row in A matrix */
+                   int ldu,  /*number of k in dgemm */
+                   int nstreams,
+                   Ublock_info_t *Ublock_info,    /*array containing prefix sum of work load */
+                   int num_blks  /*Number of work load */
+    )
+{
+    int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_gpublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       cpu is to gpu dgemm should be ideally 0:1 ratios to hide the total cost
+       However since there is gpu latency of around 20,000 ns implying about
+       200000 floating point calculation be done in that time so ~200,000/(2*nbrow*ldu)
+       should be done in cpu to hide the latency; we Ngem =200,000/2
+     */
+    int_t i, j;
+
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /*easy returns -1 when number of column are less than threshold */
+    if (Ublock_info[num_blks - 1].full_u_cols < (Ngem / (nbrow * ldu)) || num_blks == 1)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+
+        return;
+
+    }
+
+    /* Easy return -2 when number of streams =0 */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /*find first block where count > Ngem */
+
+
+    for (i = 0; i < num_blks - 1; ++i)  /*I can use binary search here */
+    {
+        if (Ublock_info[i + 1].full_u_cols > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+       Ublock_info [num_blks - 1].full_u_cols - Ublock_info[*ncpu_blks - 1].full_u_cols;
+
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* now atleast one stream would be used */
+
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+
+        int_t cutoff = cols_per_stream + Ublock_info[*ncpu_blks - 1].full_u_cols;
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+                if (Ublock_info[j + 1].full_u_cols > cutoff)
+                {
+
+                    cutoff = cols_per_stream + Ublock_info[j].full_u_cols;
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+
+            }
+
+        }
+
+    }
+}
+
+#endif  /* defined GPU_ACC */
+
+#endif // comment out the above code 
diff --git a/SRC/sscatter3d.c b/SRC/sscatter3d.c
new file mode 100644
index 00000000..c43a7460
--- /dev/null
+++ b/SRC/sscatter3d.c
@@ -0,0 +1,625 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, float *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitilize the pointers to the begining of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+/*! \brief GEMM one (L(lb), U(j)) block pair and scatter the product.
+ *
+ * Computes tempv1 = L-panel * U-panel via superlu_sgemm, then subtracts the
+ * result from U(ib,jb) when ib < jb, otherwise from L(ib,jb).  The column
+ * range of the U panel comes from the full_u_cols prefix sums in
+ * Ublock_info[].
+ *
+ * NOTE(review): the thread_id argument is ignored -- it is unconditionally
+ * overwritten with omp_get_thread_num() (or 0 without OpenMP) below.
+ */
+void
+sblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    float *L_mat, int ldl,
+                    float *U_mat, int ldu,
+                    float *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    /* Per-thread slices of the shared scratch arrays. */
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    float alpha = 1.0, beta = 0.0;
+
+    /* calling SGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        sscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple thread updating the same block*/
+/*! \brief Locked variant of sblock_gemm_scatter.
+ *
+ * Same GEMM + scatter as sblock_gemm_scatter(), but the scatter phase is
+ * guarded by an optional OpenMP lock so multiple threads may safely update
+ * the same destination block.  The U-panel column range comes precomputed
+ * in Ublock_info[j].StCol / .ncols rather than the full_u_cols prefix sums.
+ *
+ * Fixes vs. previous revision:
+ *  - alpha/beta were declared double in this single-precision routine;
+ *    use float, consistent with sblock_gemm_scatter and superlu_sgemm.
+ *  - the busy spin on omp_test_lock() is replaced by a blocking
+ *    omp_set_lock(), which is semantically equivalent but does not burn
+ *    a core while waiting.
+ */
+void
+sblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         float *L_mat, int_t ldl,
+                         float *U_mat, int_t ldu,
+                         float *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    /* Per-thread slices of the shared scratch arrays. */
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    /* Single-precision constants for the 's' routine (was double). */
+    float alpha = 1.0, beta = 0.0;
+
+    /* calling SGEMM */
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /* Acquire the destination-block lock; omp_set_lock blocks until the
+       lock is available (no busy spin). */
+    if (lock)       /*lock is not null*/
+        omp_set_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        sscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+/*! \brief Schur-complement update for the CPU top-left region:
+ *  look-ahead L panel times host U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 float* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 sLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile int_t thread_id = omp_get_thread_num();
+#else
+    volatile int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+                        HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+} /* sblock_gemm_scatterTopLeft */
+
+/*! \brief Schur-complement update for the CPU top-right region:
+ *  look-ahead L panel times Phi-side U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  sLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile  int_t thread_id = omp_get_thread_num();
+#else
+    volatile  int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info,
+                        HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+} /* sblock_gemm_scatterTopRight */
+
+/*! \brief Schur-complement update for the CPU bottom-left region:
+ *  remaining L panel times host U panel, then scatter; per-thread
+ *  time is accumulated into SCT->SchurCompUdtThreadTime.
+ */
+int_t sblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    sLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    /* Unpack the LU factor storage once up front. */
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+#ifdef _OPENMP
+    volatile int_t thread_id = omp_get_thread_num();
+#else
+    volatile int_t thread_id = 0;
+#endif
+
+    double t_start = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info,
+                        HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc, klst, lsub, usub, ldt, thread_id,
+                        indirect, indirect2,
+                        Llu->Lrowind_bc_ptr, Llu->Lnzval_bc_ptr,
+                        Llu->Ufstnz_br_ptr, Llu->Unzval_br_ptr,
+                        xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    double t_end = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE]
+        += (double) (t_end - t_start);
+    return 0;
+
+} /* sblock_gemm_scatterBottomLeft */
+
+/*! \brief Bottom-right part of the Schur-complement update (single precision).
+ *
+ * Thin wrapper around sblock_gemm_scatter(): pairs the "remaining" L panel
+ * (HyP->Remain_L_buff / HyP->Remain_info, HyP->Rnbrow rows) with the "Phi"
+ * U buffer (HyP->bigU_Phi, leading dimension HyP->ldu_Phi) described by
+ * HyP->Ublock_info_Phi.  All remaining arguments are forwarded unchanged.
+ * The wall-clock time of the call, measured with SuperLU_timer_(), is
+ * accumulated into this thread's slot of SCT->SchurCompUdtThreadTime.
+ * Always returns 0.
+ */
+int_t sblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     sLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    /* Unpack the local L/U index and value arrays used by the scatter. */
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    /* Per-thread timing slot; spacing by CACHE_LINE_SIZE presumably avoids
+       false sharing -- NOTE(review): confirm. */
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* sblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+/* NOTE: this block is compiled out (#if 0).  It is the double-precision
+ * scatter_l kept only for reference -- see dscatter.c (comment above). */
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    /* Walk the block list in column ljb until destination block ib. */
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    /* Subtract each nonzero column segment of tempv from the dest block. */
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+/*! \brief Scatter a dense update block into the U factor (single precision).
+ *
+ * Subtracts the temporary product block tempv (temp_nbrow rows per nonzero
+ * column segment, column stride nbrow) from block U(ib,jb).  The block is
+ * located by scanning the row-block index Ufstnz_br_ptr[lib], where
+ * lib = LBi(ib, grid): ruip_lib accumulates the value offset and iuip_lib
+ * the index offset of the blocks preceding jb.  indirect[] caches the row
+ * subscripts lsub[lptr .. lptr+temp_nbrow-1].  For each column jj, the
+ * stored segment spans rows fnz (its first-nonzero index, read from the
+ * index array) up to ilst = FstBlockC(ib+1) -- hence the
+ * "ucol -= fnz; ...; ucol += ilst" pointer adjustments per column.
+ */
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           float *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point_t to fstnz index of
+       block U(i,j). */
+
+    /* Cache the row subscripts of the update block. */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        /* Bias ucol so that ucol[row] addresses the entry for global row
+           'row'; the segment for this column is stored from fnz to ilst. */
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/ssp_blas2_dist.c b/SRC/ssp_blas2_dist.c
new file mode 100644
index 00000000..279b97e8
--- /dev/null
+++ b/SRC/ssp_blas2_dist.c
@@ -0,0 +1,501 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Sparse BLAS 2, using some dense BLAS 2 operations
+ *
+ * 
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */
+
+/*
+ * File name:		ssp_blas2_dist.c
+ * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+
+/*
+ * Function prototypes
+ */
+#ifndef USE_VENDOR_BLAS
+extern void susolve(int, int, float*, float*);
+extern void slsolve(int, int, float*, float*);
+extern void smatvec(int, int, int, float*, float*, float*);
+#endif
+
+/*! \brief
+ *
+ *
+ *   Purpose
+ *   =======
+ *
+ *   sp_strsv_dist() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit, or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.   
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes, i.e.,
+ *             L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
+ *    
+ *   x       - (input/output) float*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ * 
+ */
+int
+sp_strsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+	      SuperMatrix *U, float *x, int *info)
+{
+
+#ifdef _CRAY
+    _fcd ftcs1, ftcs2, ftcs3;
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    float   *Lval, *Uval;
+    int incx = 1, incy = 1;
+    float alpha = 1.0, beta = 1.0;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    float *work;	/* dense scratch of length L->nrow; freed before every return */
+    flops_t solve_ops;
+    /*extern SuperLUStat_t SuperLUStat;*/
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U",1) !=0 ) *info = -1;
+    else if ( strncmp(trans, "N",1) !=0 && strncmp(trans, "T", 1) !=0 )
+	*info = -2;
+    else if ( strncmp(diag, "U", 1) !=0 && strncmp(diag, "N", 1) != 0 )
+	*info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+	i = -(*info);
+	xerr_dist("sp_strsv_dist", &i);
+	return 0;
+    }
+
+    Lstore = (SCformat *) L->Store;
+    Lval = (float *) Lstore->nzval;
+    Ustore = (NCformat *) U->Store;
+    Uval = (float *) Ustore->nzval;
+    solve_ops = 0;
+
+    /* Fixed: the abort message used to name sp_dtrsv_dist(), the double
+       precision routine, which is misleading in this float version. */
+    if ( !(work = floatCalloc_dist(L->nrow)) )
+	ABORT("Malloc fails for work in sp_strsv_dist().");
+    
+    if ( strncmp(trans, "N", 1)==0 ) {	/* Form x := inv(A)*x. */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L)*x */
+	    /* Fixed: free work[] on the quick-return paths below;
+	       previously each bare "return 0" leaked the allocation. */
+	    if ( L->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    /* Forward substitution, one supernode at a time. */
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+		fsupc = SuperLU_L_FST_SUPC(k);
+		istart = SuperLU_L_SUB_START(fsupc);
+		nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+		nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+		luptr = SuperLU_L_NZ_START(fsupc);
+		nrow = nsupr - nsupc;
+	        solve_ops += nsupc * (nsupc - 1);
+	        solve_ops += 2 * nrow * nsupc;
+		if ( nsupc == 1 ) {
+		    for (iptr=istart+1; iptr < SuperLU_L_SUB_START(fsupc+1); ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			++luptr;
+			x[irow] -= x[fsupc] * Lval[luptr];
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("L", strlen("L"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx, 1, 1, 1);
+		
+		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
+#endif /* _CRAY */		
+#else
+		    slsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		
+		    smatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+			&x[fsupc], &work[0] );
+#endif		
+		
+		    /* Scatter the dense GEMV result back into x and clear
+		       work[] for the next supernode. */
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; ++i, ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			x[irow] -= work[i];	/* Scatter */
+			work[i] = 0.0;
+		    }
+	 	}
+	    } /* for k ... */
+	    
+	} else {
+	    /* Form x := inv(U)*x */
+	    
+	    if ( U->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    /* Back substitution, supernodes in reverse order. */
+	    for (k = Lstore->nsuper; k >= 0; k--) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+    	        solve_ops += nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		    for (i = SuperLU_U_NZ_START(fsupc); i < SuperLU_U_NZ_START(fsupc+1); ++i) {
+			irow = SuperLU_U_SUB(i);
+			x[irow] -= x[fsupc] * Uval[i];
+		    }
+
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("U", strlen("U"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx);
+#else
+		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else		
+		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif		
+
+		    for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		        solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    	for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); 
+				i++) {
+			    irow = SuperLU_U_SUB(i);
+			    x[irow] -= x[jcol] * Uval[i];
+		    	}
+                    }
+		}
+	    } /* for k ... */
+	    
+	}
+    } else { /* Form x := inv(A')*x */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L')*x */
+	    if ( L->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = Lstore->nsuper; k >= 0; --k) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	istart = SuperLU_L_SUB_START(fsupc);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		solve_ops += 2 * (nsupr - nsupc) * nsupc;
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    iptr = istart + nsupc;
+		    for (i = SuperLU_L_NZ_START(jcol) + nsupc; 
+				i < SuperLU_L_NZ_START(jcol+1); i++) {
+			irow = SuperLU_L_SUB(iptr);
+			x[jcol] -= x[irow] * Lval[i];
+			iptr++;
+		    }
+		}
+		
+		if ( nsupc > 1 ) {
+		    solve_ops += nsupc * (nsupc - 1);
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#else
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    /* NOTE(review): this non-vendor path still calls the
+		       external BLAS strsv_; confirm a BLAS is linked even
+		       when USE_VENDOR_BLAS is undefined. */
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#endif
+		}
+	    }
+	} else {
+	    /* Form x := inv(U')*x */
+	    if ( U->nrow == 0 ) { /* Quick return */
+		SUPERLU_FREE(work);
+		return 0;
+	    }
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); i++) {
+			irow = SuperLU_U_SUB(i);
+			x[jcol] -= x[irow] * Uval[i];
+		    }
+		}
+
+		solve_ops += nsupc * (nsupc + 1);
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#endif
+		}
+	    } /* for k ... */
+	}
+    }
+
+    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
+    SUPERLU_FREE(work);
+    return 0;
+} /* sp_strsv_dist */
+
+
+/*! \brief SpGEMV
+
+  Purpose   
+    =======   
+
+    sp_sgemv_dist()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) double
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    X      - (input) float*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) float*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+
+*/ +int +sp_sgemv_dist(char *trans, float alpha, SuperMatrix *A, float *x, + int incx, float beta, float *y, int incy) +{ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int info; + float temp, temp1; + int lenx, leny, i, j, irow; + int iy, jx, jy, kx, ky; + int notran; + float zero = 0.0; + float one = 1.0; + + notran = (strncmp(trans, "N", 1)==0); + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Test the input parameters */ + info = 0; + if ( !notran && strncmp(trans, "T", 1) !=0 && strncmp(trans, "C", 1) != 0) + info = 1; + else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; + else if (incx == 0) info = 5; + else if (incy == 0) info = 8; + if (info != 0) { + xerr_dist("sp_sgemv_dist ", &info); + return 0; + } + + /* Quick return if possible. */ + if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.) + return 0; + + /* Set LENX and LENY, the lengths of the vectors x and y, and set + up the start points in X and Y. */ + if ( strncmp(trans, "N", 1)==0 ) { + lenx = A->ncol; + leny = A->nrow; + } else { + lenx = A->nrow; + leny = A->ncol; + } + if (incx > 0) kx = 0; + else kx = - (lenx - 1) * incx; + if (incy > 0) ky = 0; + else ky = - (leny - 1) * incy; + + /* Start the operations. In this version the elements of A are + accessed sequentially with one pass through A. */ + /* First form y := beta*y. */ + if (beta != 1.) { + if (incy == 1) { + if (beta == 0.) + for (i = 0; i < leny; ++i) y[i] = zero; + else + for (i = 0; i < leny; ++i) y[i] = beta * y[i]; + } else { + iy = ky; + if (beta == 0.) + for (i = 0; i < leny; ++i) { + y[iy] = zero; + iy += incy; + } + else + for (i = 0; i < leny; ++i) { + y[iy] = beta * y[iy]; + iy += incy; + } + } + } + + if (alpha == 0.) return 0; + + if ( notran ) { + /* Form y := alpha*A*x + y. */ + jx = kx; + if (incy == 1) { + for (j = 0; j < A->ncol; ++j) { + if (x[jx] != 0.) 
{ + temp = alpha * x[jx]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + y[irow] += temp * Aval[i]; + } + } + jx += incx; + } + } else { + ABORT("Not implemented."); + } + } else { + /* Form y := alpha*A'*x + y. */ + jy = ky; + if (incx == 1) { + for (j = 0; j < A->ncol; ++j) { + temp = zero; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + temp += Aval[i] * x[irow]; + } + y[jy] += alpha * temp; + jy += incy; + } + } else { + ABORT("Not implemented."); + } + } + return 0; +} /* sp_sgemv_dist */ diff --git a/SRC/ssp_blas3_dist.c b/SRC/ssp_blas3_dist.c new file mode 100644 index 00000000..4bf00e01 --- /dev/null +++ b/SRC/ssp_blas3_dist.c @@ -0,0 +1,138 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Sparse BLAS3, using some dense BLAS3 operations + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */
+
+/*
+ * File name:		ssp_blas3_dist.c
+ * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+
+  Purpose   
+    =======   
+
+    sp_sgemm_dist performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
+             In the future, more general A can be handled.
+
+    B      - float array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, n ).  
+             Unchanged on exit.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - float array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*B + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.  
+
+*/ +int +sp_sgemm_dist(char *transa, int n, float alpha, SuperMatrix *A, + float *b, int ldb, float beta, + float *c, int ldc) +{ + + int incx = 1, incy = 1; + int j; + + for (j = 0; j < n; ++j) { + sp_sgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy); + } + return 0; +} diff --git a/SRC/sstatic_schedule.c b/SRC/sstatic_schedule.c new file mode 100644 index 00000000..ebb416f5 --- /dev/null +++ b/SRC/sstatic_schedule.c @@ -0,0 +1,984 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs static scheduling for the look-ahead factorization algorithm. + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified: February 20, 2020, changed to be precision-dependent.
+ *
+ * Reference:
+ * 
+ * 
+ */ + +#include "superlu_sdefs.h" + +#ifdef ISORT +extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); +extern void isort1 (int_t N, int_t * ARRAY); + +#else + +static int +superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + +int +sstatic_schedule(superlu_dist_options_t * options, int m, int n, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, + int_t *perm_c_supno, int_t *iperm_c_supno, int *info) +{ +/* + * Arguments + * ========= + * + * perm_c_supno (output) + * perm_c_supno[k] = j means at the k-th step of elimination, the j-th + * supernode is chosen. + * iperm_c_supno (output), inverse of perm_c_supno[] + * iperm_c_supno[j] = k means the j-th supernode will be scheduled + * at the k-th step of elimination. + * + */ + int_t *xsup; + int_t i, ib, jb, lb, nlb, il, iu; + int_t Pc, Pr; + int iam, krow, yourcol, mycol, myrow; + int j, k, nsupers; /* k - current panel to work on */ + int_t *index; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + + int iword = sizeof (int_t); + + /* Test the input parameters. */ + *info = 0; + if (m < 0) *info = -2; + else if (n < 0) *info = -3; + if (*info) { + pxerr_dist ("static_schedule", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "before static schedule"); +#endif + + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if (options->lookahead_etree == YES && /* use e-tree of symmetrized matrix and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */ + + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if (options->ParSymbFact == NO) { + /* Use the etree computed from serial symb. fact., and turn it + into supernodal tree. */ + int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using column e-tree ===\n"); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers; + + for (j = 0, lb = 0; lb < nsupers; lb++) { + for (k = 0; k < SuperSize (lb); k++) { + jb = Glu_persist->supno[etree[j + k]]; + if (jb != lb) + etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb); + } + j += SuperSize (lb); + } + } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). 
*/ +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using supernodal e-tree ===\n"); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers; + for (lb = 0; lb < ncb; lb++) { + jb = lb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for (j = 0; j < i; j++) + { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for (j = 0; j < i; j++) { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + + MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t, + MPI_MIN, grid->comm); + + SUPERLU_FREE (etree_supno_l); + } + + /* initialize number of children for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) + if (etree_supno[i] != nsupers) num_child[etree_supno[i]]++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; 
i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + ptr = head; + j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + if (etree_supno[j] != nsupers) { + num_child[etree_supno[j]]--; + if (num_child[etree_supno[j]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + SUPERLU_FREE (num_child); + SUPERLU_FREE (etree_supno); + log_memory(-2 * nsupers * iword, stat); + + } else { /* Unsymmetric pattern */ + + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very naive implementation) */ + int nrbp1 = nrb + 1; + float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes; + + /* allocate some workspace */ + if (! 
(sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int)))) + ABORT ("Malloc fails for sendcnts[]."); + log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat); + + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls[Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls[Pr * Pc]; + rrows = &srows[Pr * Pc * nrbp1]; + + myrow = MYROW (iam, grid); +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using DAG ===\n"); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + lb]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + nrb]++; + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if (!(blocks = intMalloc_dist (nblocks))) + ABORT ("Malloc fails for blocks[]."); + log_memory( nblocks * iword, stat ); + + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % 
Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* communication */ + MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm); + + log_memory( -(nblocks * iword), stat ); /* blocks[] to be freed soon */ + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + nblocks += recvcnts[p]; + } + + if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[]."); + log_memory( nblocks * iword, stat ); + + MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, + rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (blocks); /* memory logged before */ + + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[]."); + if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[]."); + + Ublock_bytes = nblocks * iword; + Urows_bytes = (1 + nlb) * iword; + log_memory( Ublock_bytes + Urows_bytes, stat ); + + k = 0; + for (jb = 0; jb < nlb; jb++) { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for (i = rdispls[p]; 
i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[jb]++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1 (Urows[jb], &(Ublock[k - Urows[jb]])); +#else + qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; + for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[nlb]++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]])); +#else + qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + SUPERLU_FREE (blockr); + log_memory( -nblocks * iword, stat ); + + /* sort the block in L-factor */ + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + + if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[]."); + if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[]."); + + Lblock_bytes = nblocks * iword; + Lrows_bytes = (1 + ncb) * iword; + log_memory(Lblock_bytes + Lrows_bytes, stat); + + for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0; + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k 
+ 1]; + i--; + } + + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[lb]++; + nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]])); +#else + qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[ncb]++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]])); +#else + qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[]."); + if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[]."); + + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) + sf_block_l[lb] = nsupers; + i = 0; + j = 0; + for (jb = 0; jb < nlb; jb++) { + if (Urows[jb] > 0) { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for (k = 0; k < Lrows[jb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } else { + j++; + } + } + i = ib; + } else { + j += Lrows[jb]; + } + } + if (mycol < nsupers % grid->npcol) { + if (Urows[nlb] > 0) { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for (k = 0; k < Lrows[nlb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) + { + sf_block_l[lb] = Lblock[j]; + j += 
(Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + j++; + } + } + i = ib; + } else { + j += Lrows[nlb]; + } + } + + /* compute the first global symmetric matchs */ + MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, + grid->comm); + SUPERLU_FREE (sf_block_l); + log_memory( -nsupers * iword, stat ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[]."); + if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[]."); + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + 
for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + if (mycol == pc) { + /* from U-factor */ + ib = iu + Urows[j]; + for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, + grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * iword, stat); + + k = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + SUPERLU_FREE (recvbuf); + log_memory(-nblocks * iword, stat); + +#else /* not USE_ALLGATHER */ + int nlsupers = nsupers / Pc; + if 
(mycol < nsupers % Pc) nlsupers++; + + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nlsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (mycol == pc) { + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + /* from U-factor */ + ib = iu + Urows[j]; + for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } else if (nnodes_l[lb] + nnodes_u[lb] != 0) + printf (" # %d: nnodes[" IFMT "]=" IFMT "+" IFMT "\n", + grid->iam, lb, nnodes_l[lb], nnodes_u[lb]); + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * sizeof(int_t *), stat); + + k = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + 
} + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) + { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + } + SUPERLU_FREE (recvbuf); + log_memory( -nblocks * iword , stat); + +#endif /* end USE_ALL_GATHER */ + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) { + for (jb = 0; jb < nnodes_l[i]; jb++) { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + /*printf( "=== pop %d (%d) ===\n",head->id,i ); */ + ptr = head; + j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + for (jb = 0; jb < nnodes_l[j]; jb++) { + num_child[edag_supno[j][jb]]--; + if (num_child[edag_supno[j][jb]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + for (lb = 0; lb < nsupers; lb++) + if (nnodes_l[lb] > 0) SUPERLU_FREE 
(edag_supno[lb]); + + SUPERLU_FREE (num_child); + SUPERLU_FREE (edag_supno); + SUPERLU_FREE (nnodes_l); + SUPERLU_FREE (sf_block); + SUPERLU_FREE (sendcnts); + + log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat); + + SUPERLU_FREE (Ublock); + SUPERLU_FREE (Urows); + SUPERLU_FREE (Lblock); + SUPERLU_FREE (Lrows); + log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "after static schedule"); + check_perm_dist("perm_c_supno", nsupers, perm_c_supno); + check_perm_dist("iperm_c_supno", nsupers, iperm_c_supno); +#endif + + return 0; +} /* STATIC_SCHEDULE */ + diff --git a/SRC/ssuperlu_blas.c b/SRC/ssuperlu_blas.c new file mode 100644 index 00000000..5d820665 --- /dev/null +++ b/SRC/ssuperlu_blas.c @@ -0,0 +1,123 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Wrapper functions to call BLAS. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_sdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_sgemm(const char *transa, const char *transb,
+                  int m, int n, int k, float alpha, float *a,
+                  int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    sgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return sgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+int superlu_strsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const float alpha, const float *a,
+                  const int lda, float *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    strsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return strsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_sger(const int m, const int n, const float alpha,
+                 const float *x, const int incx, const float *y,
+                 const int incy, float *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    sger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_sscal(const int n, const float alpha, float *x, const int incx)
+{
+    sscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_saxpy(const int n, const float alpha,
+    const float *x, const int incx, float *y, const int incy)
+{
+    saxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_sgemv(const char *trans, const int m,
+                  const int n, const float alpha, const float *a,
+                  const int lda, const float *x, const int incx,
+                  const float beta, float *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_strsv(char *uplo, char *trans, char *diag,
+                  int n, float *a, int lda, float *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
new file mode 100644
index 00000000..dbdabe9d
--- /dev/null
+++ b/SRC/ssuperlu_gpu.cu
@@ -0,0 +1,1780 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "superlu_defs.h" + +#undef Reduce + +//#include + +#include "slustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + float *nzval, int_t ldv, + float *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void sdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + float *nzval, int ldv, + const float *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + float * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + float * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + sLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + float *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + float *LnzvalVec = A_gpu->LnzvalVec; + float *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int THREAD_BLOCK_SIZE =ldt; + + int* pfxStorage = 
(int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + float *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + float *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + float *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if (thread_id < 
blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + float *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + sdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + 
nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + float * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + float * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("sSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(float), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(float), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(float) + + bigu_send_size * sizeof(float) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + float alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + 
int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. 
*/ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + gpublasSgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end sSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(gpuDeviceProp devProp) +{ + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + gpuMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu) +{ + /* Free the L data structure on GPU */ + checkGPU(gpuFree(A_gpu->LrowindVec)); + checkGPU(gpuFree(A_gpu->LrowindPtr)); + + checkGPU(gpuFree(A_gpu->LnzvalVec)); + checkGPU(gpuFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + + /*freeing the pinned memory*/ + int_t streamId = 0; + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkGPU(gpuFreeHost(A_gpu->acc_L_buff)); + checkGPU(gpuFreeHost(A_gpu->acc_U_buff)); + checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + 
checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE + SUPERLU_FREE(A_gpu->GemmStart); + SUPERLU_FREE(A_gpu->GemmEnd); + SUPERLU_FREE(A_gpu->ScatterEnd); + SUPERLU_FREE(A_gpu->ePCIeH2D); + SUPERLU_FREE(A_gpu->ePCIeD2H_Start); + SUPERLU_FREE(A_gpu->ePCIeD2H_End); + + /* Free the U data structure on GPU */ + checkGPU(gpuFree(A_gpu->UrowindVec)); + checkGPU(gpuFree(A_gpu->UrowindPtr)); + + //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated + + checkGPU(gpuFree(A_gpu->UnzvalVec)); + checkGPU(gpuFree(A_gpu->UnzvalPtr)); + + checkGPU(gpuFree(A_gpu->grid)); + + /* Free the Schur complement structure on GPU */ + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkGPU(gpuFree(A_gpu->indirect)); + // checkGPU(gpuFree(A_gpu->indirect2)); + checkGPU(gpuFree(A_gpu->xsup)); + + checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub)); + checkGPU(gpuFree(A_gpu->scubufs[streamId].usub)); + + checkGPU(gpuFree(A_gpu->local_l_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->jib_lookupVec)); + checkGPU(gpuFree(A_gpu->jib_lookupPtr)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoVec)); + checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr)); + checkGPU(gpuFree(A_gpu->ijb_lookupVec)); + checkGPU(gpuFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void sPrint_matrix( char *desc, int_t m, int_t n, float * dA, int_t lda ) +{ + float *cPtr = (float *) malloc(sizeof(float) * lda * n); + checkGPU(gpuMemcpy( cPtr, dA, + lda * n * sizeof(float), gpuMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" ); + 
} + free(cPtr); +} + +void sprintGPUStats(sLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + +/* Initialize the GPU side of the data structure. 
*/ +int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, // LU structures on GPU, see slustruct_gpu.h + sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + checkGPUErrors(gpuDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nGPUStreams = getnGPUStreams(); + + int SCATTER_THREAD_BLOCK_SIZE = ldt; + if(getenv("SCATTER_THREAD_BLOCK_SIZE")) + { + int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE")); + if(stbs>=ldt) + { + SCATTER_THREAD_BLOCK_SIZE = stbs; + } + + } + + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams); + fflush(stdout); + printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE); + + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + gpuStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + gpuStreamCreate(&(sluGPU->funCallStreams[streamId])); + gpublasCreate(&(sluGPU->gpublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (sLUstruct_gpu_t *) malloc (sizeof(sLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. 
*/ + sCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end sinitSluGPU3D_t */ + + +int sinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + ssluGPU_t *sluGPU, gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && 
Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* sinitD2Hreduce */ + +int sreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + sLUstruct_t *LUstruct +) +{ + sLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + float** Unzval_br_ptr = Llu->Unzval_br_ptr; + + gpuStream_t CopyStream; + sLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = 
d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + float *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + float alpha = 1; + superlu_saxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + float *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + float alpha = 1; + superlu_saxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* sreduceGPUlu */ + + +int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int ssendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + gpuStream_t CopyStream = sluGPU->CopyStream;; + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(float), gpuMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, 
&A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(float), gpuMemcpyDeviceToHost, CopyStream ) ); + gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(float) + l_copy_len * sizeof(float); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int ssendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + float* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + ssluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] ssendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in ssendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(float)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(float)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(ssluGPU_t *sluGPU) +{ + return 0; +} +#endif + +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling sfree_LUsstruct_gpu(). 
*/ +void sCopyLUToGPU3D ( + int* isNodeInMyGrid, + sLocalLU_t *A_host, /* distributed LU structure on host */ + ssluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(float) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(float) )) ; + A_gpu->scubufs[streamId].bigU_host = (float *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (float *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(float) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (float *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(float) )) ; + A_gpu->scubufs[streamId].bigU = (float *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + 
A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(float))) ; + A_gpu->scubufs[streamId].bigV = (float *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(float); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + 
/*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + 
checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + 
+ checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(float) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + 
int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(float) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(float) * 1e-9, + u_val_len * sizeof(float) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(float))); + A_gpu->LnzvalVec = (float *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(float))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( 
&tmp_ptr, u_val_len * sizeof(float))); + A_gpu->UnzvalVec = (float *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(float))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(sLUstruct_gpu_t))) ; + *dA_gpu = (sLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(sLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end sCopyLUToGPU3D */ + + + +int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nGPUStreams; 
streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + sinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + sreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + sreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* sreduceAllAncestors3d_GPU */ + + +void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/ssuperlu_gpu.hip.cpp b/SRC/ssuperlu_gpu.hip.cpp new file mode 100644 index 00000000..e474ab12 --- /dev/null +++ b/SRC/ssuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "ssuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/streeFactorization.c b/SRC/streeFactorization.c new file mode 100644 index 00000000..5ad8adc7 --- /dev/null +++ b/SRC/streeFactorization.c @@ -0,0 +1,763 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+/*! \brief Allocate the four L/U subscript and value exchange buffers in LUvsb.
+ *
+ * Buffer lengths come from Llu->bufmax[0..3], the maximum subscript/value
+ * sizes recorded in the local LU data structure.  Always returns 0.
+ * NOTE(review): allocation failure (NULL return from intMalloc_dist /
+ * floatMalloc_dist) is not checked here -- confirm callers tolerate NULL.
+ */
+int_t sLluBufInit(sLUValSubBuf_t* LUvsb, sLUstruct_t *LUstruct)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
+    LUvsb->Lval_buf = floatMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]);
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
+    LUvsb->Uval_buf = floatMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]);
+    return 0;
+}
+
+/*! \brief Allocate an array of mxLeafNode diagonal-factorization buffer
+ *  structs; each element is allocated and initialized via
+ *  sinitDiagFactBufs(ldt, ...).
+ *
+ *  mxLeafNode can be 0 for the replicated layers of the processes; in that
+ *  case nothing is allocated and NULL is returned.  (Bug fix: previously
+ *  the uninitialized local pointer was returned when mxLeafNode == 0.)
+ *  The 'grid' argument is unused here but kept for interface consistency
+ *  with the other *InitArr routines.  Free with sfreeDiagFactBufsArr().
+ */
+sdiagFactBufs_t** sinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    sdiagFactBufs_t** dFBufs = NULL;  /* stays NULL when mxLeafNode == 0 */
+
+    if ( mxLeafNode ) dFBufs = (sdiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(sdiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        dFBufs[i] = (sdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(sdiagFactBufs_t));
+        assert(dFBufs[i]);                 /* abort if the element allocation failed */
+        sinitDiagFactBufs(ldt, dFBufs[i]); /* allocate the ldt*ldt L and U factor blocks */
+    }
+
+    return dFBufs;
+}
+
+// sherry added
+int sfreeDiagFactBufsArr(int_t mxLeafNode, sdiagFactBufs_t** dFBufs)
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+	SUPERLU_FREE(dFBufs[i]->BlockUFactor);
+	SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+	SUPERLU_FREE(dFBufs[i]);
+    }
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs);
+
+    return 0;
+}
+
+/*! \brief Allocate an array of numLA look-ahead L/U exchange buffer structs,
+ *  each initialized with sLluBufInit() (sizes taken from LUstruct->Llu->bufmax).
+ *  Free with sLluBufFreeArr().
+ *  NOTE(review): neither the array nor the element allocations are
+ *  NULL-checked here.
+ */
+sLUValSubBuf_t** sLluBufInitArr(int_t numLA, sLUstruct_t *LUstruct)
+{
+    sLUValSubBuf_t** LUvsbs = (sLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(sLUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* code */
+        LUvsbs[i] = (sLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+        sLluBufInit(LUvsbs[i], LUstruct);
+    } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
+
+    return LUvsbs;
+}
+
+// sherry added
+int sLluBufFreeArr(int_t numLA, sLUValSubBuf_t **LUvsbs)
+{
+    for (int_t i = 0; i < numLA; ++i) {
+	SUPERLU_FREE(LUvsbs[i]->Lsub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+	SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+	SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+/*! \brief Allocate the Schur-complement-update work buffers:
+ *  bigV (sized via sgetBigV from ldt and num_threads) and
+ *  bigU (sized via sgetBigU from nsupers, grid, LUstruct).
+ *  Always returns 0.  Free with sfreeScuBufs().
+ */
+int_t sinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                  sscuBufs_t* scuBufs,
+                  sLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    scuBufs->bigV = sgetBigV(ldt, num_threads);
+    scuBufs->bigU = sgetBigU(nsupers, grid, LUstruct);
+    return 0;
+}
+
+// sherry added
+int sfreeScuBufs(sscuBufs_t* scuBufs)
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
+
+/*! \brief Allocate the ldt*ldt single-precision U and L diagonal-factor
+ *  blocks of one sdiagFactBufs_t.  Always returns 0.
+ *  NOTE(review): allocations are not NULL-checked here; the array wrapper
+ *  sinitDiagFactBufsArr() asserts only on the struct allocation.
+ */
+int_t sinitDiagFactBufs(int_t ldt, sdiagFactBufs_t* dFBuf)
+{
+    dFBuf->BlockUFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    dFBuf->BlockLFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    return 0;
+}
+
+/*! \brief 2D factorization of one subtree, processing supernodes strictly
+ *  in the order given by perm_c_supno (no look-ahead pipelining).
+ *
+ *  For each supernode k: diagonal factorization + diagonal-block broadcast
+ *  (sDiagFactIBCast), L panel update + Ibcast, U panel update + Ibcast,
+ *  waits for the incoming L and U panels, then sets up and applies the
+ *  Schur-complement update over all (U-block, L-block) pairs via
+ *  sblock_gemm_scatter -- in parallel when compiled with OpenMP.
+ *  Schur-update time is accumulated into SCT->NetSchurUpTimer; *info is
+ *  forwarded to sDiagFactIBCast for its status reporting.  Returns 0.
+ *  NOTE(review): the #if 0 branches keep an older wrapper-based calling
+ *  convention for reference; the #else branches are the live code.
+ */
+int_t sdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    sscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    sLUValSubBuf_t* LUvsb,
+    sdiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	sIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	sUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	sIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	/* Block until the L and U panels of supernode k have arrived;
+	 * required before the Schur-complement update below may read them. */
+	sWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	sWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT);
+#endif
+        double tsch = SuperLU_timer_();
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= sSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            float* bigV = scuBufs->bigV;
+            float* bigU = scuBufs->bigU;
+
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij)
+            {
+                /* code */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                float** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                /* decode the flat index into a (U-block, L-block) pair */
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                float *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                float *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                sblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT);
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* sdenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ */
+int_t ssparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    sscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    sLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    sdiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    /* Pipelined (look-ahead) sparse LU factorization of one elimination
+     * subtree on the 2D process grid -- CPU-only path.  Supernodes are
+     * factored level by level in the subtree's topological order; panel
+     * broadcasts for up to numLA future supernodes are kept in flight so
+     * communication overlaps the Schur-complement update.  On success *info
+     * is 0 and the return value is 0. */
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        /* Empty subtree: nothing to factor.  NOTE(review): returns 1, not 0 --
+         * confirm callers treat a nonzero return as "no work done", not as an
+         * error code. */
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter ssparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    /* Per-supernode progress flags shared between the phases below:
+     *   factored_D/L/U  -- diagonal / L-panel / U-panel work completed;
+     *   IbcastPanel_L/U -- non-blocking panel broadcast already posted;
+     *   IrecvPlcd_D     -- Irecv for the diagonal block already placed. */
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    /* Depth of the look-ahead window: how many panel broadcasts may be in
+     * flight at once (also the cycle length for the msgss/LUvsbs buffers). */
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    /* Phase 0: eagerly factor and Ibcast the diagonal block of every
+     * supernode in topological level 0 (the leaves of this subtree);
+     * eTreeTopLims[1] is one past the last level-0 node. */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        /* Pass 1: diagonal factorization for any node of this level not
+         * already handled by phase 0 or by look-ahead from the level below. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        /* Pass 2: panel triangular solves -- produce the L panel (using the
+         * received U diagonal factor) and the U panel (using the L diagonal
+         * factor) of each node in this level, if not already done. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        /* Pass 3: prime the pipeline -- post L/U panel broadcasts for the
+         * first numLA nodes of this level.  Comm buffers are reused
+         * cyclically, hence the k0 % numLA indexing (unlike the per-level
+         * k0 - k_st indexing of passes 1-2). */
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        /* Main pass: for each node of this level -- wait for its L/U panels,
+         * perform the Schur-complement update, then keep the pipeline full by
+         * scheduling future broadcasts and look-ahead work for the next
+         * level. */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = sSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            /* jj_cpu == 0: in this CPU-only path the entire num_u_blks_Phi
+             * region is processed on the host (no GPU offload partition). */
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            float* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Look-ahead L rows X the Phi (right) portion of U. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Remaining L rows X the left portion of U. */
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            /* Etree bookkeeping: this node's contribution is in; decrement
+             * the parent's outstanding-children count.  When it hits zero the
+             * parent's diagonal can be factored early (look-ahead into the
+             * next topological level). */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        /* Parent's position within the NEXT level, used to
+                         * index the per-level dFBufs/comReqss.  NOTE(review):
+                         * assumes k0_parent >= k_end, i.e. myIperm orders
+                         * parents after the current level -- confirm. */
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                /* Remaining L rows X the Phi portion of U, past jj_cpu
+                 * (jj_cpu is 0 here, so the whole Phi region). */
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            /* abs_offset: position within this level (matches the comReqss
+             * indexing used when the diagonal Ibcast was posted). */
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            /* Keep up to numLA panel broadcasts in flight: post L/U Ibcasts
+             * for upcoming nodes whose panels are already factored. */
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            /* Look-ahead: try to advance nodes of the NEXT level whose
+             * diagonal block has already arrived (panel solves + early
+             * broadcasts if they fall inside the look-ahead window). */
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    sLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                /* offset1: cyclic comm-buffer slot for the
+                                 * early broadcast. */
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    sUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                /* NOTE(review): this 'offset' shadows the
+                                 * per-level offset above (the L branch names
+                                 * its slot 'offset1'); intentional but easy
+                                 * to misread -- cyclic comm-buffer index. */
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit ssparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* ssparseTreeFactor_ASYNC */
diff --git a/SRC/streeFactorizationGPU.c b/SRC/streeFactorizationGPU.c
new file mode 100644
index 00000000..b136ebbf
--- /dev/null
+++ b/SRC/streeFactorizationGPU.c
@@ -0,0 +1,759 @@
+
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "slustruct_gpu.h" + +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int ssparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + sscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + sLUValSubBuf_t **LUvsbs, // size = num Look ahead + sdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + ssluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, 
+ int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + ssyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + sinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + sinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = sSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + float *bigV = scuBufs->bigV; + float *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + sblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + sblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->RemainBlk * 
HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + sblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + sinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + sreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + swaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + ssendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + sSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + sblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + sLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + sUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end ssparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/strfAux.c b/SRC/strfAux.c new file mode 100644 index 00000000..6a26e5e6 --- /dev/null +++ b/SRC/strfAux.c @@ -0,0 +1,758 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure that assists halo offload of the Schur complement. */
+/*! \brief Initialize the HyP_t bookkeeping structure used for the
+ *         Schur-complement (halo) offload.
+ *
+ * Allocates the look-ahead / remaining block-info arrays and the L/U
+ * "dirty bit" arrays, and resets the offload state.
+ *
+ * \param HyP  structure to initialize (arrays allocated here)
+ * \param Llu  local LU data; Llu->bufmax[1] sizes the L value buffers
+ * \param mcb  number of column blocks (sizes Ublock_info*, Lblock_dirty_bit)
+ * \param mrb  number of row blocks (sizes *_info, Ublock_dirty_bit)
+ *
+ * NOTE(review): allocation results are not checked for NULL, consistent
+ * with the surrounding codebase style.
+ */
+void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;  /* no offload performed yet */
+#if 0
+    /* Alternative: 64-byte aligned allocations for vectorized kernels. */
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    /* -1 marks every block as "clean" (never touched by an offload). */
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    /* (removed a redundant second assignment of HyP->last_offload = -1;
+       it is already set at the top of this routine) */
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nGPUStreams = 0;
+} /* sInit_HyP */
+
+/* Initialize the 3D LU structure through the forest interface
+   (wrapper over the node-count/node-list based sinit3DLUstruct). */
+void sinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, sLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    /* A complete binary tree over the Z process dimension has
+       2^maxLvl - 1 forests in total. */
+    int_t nLevels  = log2i(grid3d->zscp.Np) + 1;
+    int_t nForests = (1 << nLevels) - 1;
+
+    /* Flatten the forest array into per-forest node counts and node lists,
+       the representation expected by the older interface. */
+    int_t*  nodeCounts = INT_T_ALLOC (nForests);
+    int_t** nodeLists  =  (int_t**) SUPERLU_MALLOC(nForests * sizeof(int_t*));
+
+    for (int f = 0; f < nForests; ++f)
+	{
+	    if (sForests[f])
+		{
+		    nodeCounts[f] = sForests[f]->nNodes;
+		    nodeLists[f]  = sForests[f]->nodeList;
+		}
+	    else
+		{
+		    /* Empty slot in the forest array. */
+		    nodeCounts[f] = 0;
+		    nodeLists[f]  = NULL;
+		}
+	}
+
+    /* Delegate to the original count/list based initializer. */
+    sinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     nodeCounts, nodeLists, LUstruct, grid3d);
+
+    /* Only the flattened wrappers are freed; the node lists themselves
+       are owned by the forests. */
+    SUPERLU_FREE(nodeCounts);
+    SUPERLU_FREE(nodeLists);
+}
+
+/*! \brief Set up the data structures for the Schur-complement update of
+ *         panel k: locate the L(:,k) and U(k,:) panels (local or received),
+ *         build Ublock_info / Remain_info metadata, and gather U into bigU.
+ *
+ *  Returns nonzero (LU_nonempty) if both L(:,k) and U(k,:) are non-empty
+ *  (msgcnt[0] && msgcnt[2]); returns 0 otherwise, doing no setup work.
+ */
+int_t sSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    float *bigU,
+    int_t* Lsub_buf,
+    float *Lval_buf,
+    int_t* Usub_buf,
+    float *Uval_buf,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Owner row/column of panel k in the 2D process grid. */
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    /* Pick up L(:,k): from local storage if this process column owns it,
+       otherwise from the receive buffers. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    /* Pick up U(k,:) analogously, from local storage or receive buffers. */
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+        /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval stay
+           uninitialized; presumably msgcnt guarantees that combination never
+           reaches the code below — TODO confirm against the caller. */
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];   /* nonzero iff L(:,k) is non-empty */
+    int_t msg2 = msgcnt[2];   /* nonzero iff U(k,:) is non-empty */
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;   /* nothing to update */
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];  /* leading dimension of L panel */
+        int_t nlb;                    /* number of off-diagonal L blocks */
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        /* Build the permutation keys: elimination order of each U block. */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        /* perm_u holds (key, index) pairs; sort pairs by key. */
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        int_t ldu   = 0;          /* max segment size across all U blocks */
+        int_t full  = 1;          /* 1 while all segments share the same size */
+        int_t num_u_blks = 0;
+
+        /* Record per-block metadata (iukp/rukp offsets, block id,
+           elimination order, nonzero column count) for each U block. */
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;   /* note: shadows the outer iukp */
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            /* Count columns with a nonzero segment in this block. */
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            /* Track the largest segment size (ldu) for the GEMM call. */
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        /* Prefix sums: full_u_cols = cumulative columns, StCol = start column. */
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        /* Pack the U panel densely into bigU for the GEMM call. */
+        sgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        /* Record metadata for the off-diagonal L blocks ("remaining" rows). */
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* sSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+int_t sSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    sscuBufs_t* scuBufs, sLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, sLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    float * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    float* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    if (mycol == kcol)
+    {
+        /* Column k is local: point at the owned L panel storage. */
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)  /* U(k,:) was received into the buffers. */
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* Now each process performs its portion of the Schur-complement update. */
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        sRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            sRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* sSchurComplementSetupGPU */
+
+
+float* sgetBigV(int_t ldt, int_t num_threads)
+{
+    float *bigV;  /* scratch buffer: 8 * ldt * ldt floats per thread */
+    if (!(bigV = floatMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for dgemm buffV");
+    return bigV;
+}
+
+float* sgetBigU(int_t nsupers, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Find the maximum row dimension (lsub[1]) among my local L panels. */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Reduce to the global max row size across the process row. */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /*Buffer size is max of look-ahead window*/
+
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    float* bigU = floatMalloc_dist(bigu_size);
+
+    return bigU;
+} /* sgetBigU */
+
+
+trf3Dpartition_t* sinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      sLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter sinitTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* initially every child of node i is still unprocessed */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    sinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    sLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+    sLluBufInit(LUvsb, LUstruct);
+
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* walk the nodes of forest Fr */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* record which forest owns this supernode */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit sinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* sinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void sDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter sDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1; /* complete binary tree of forests */
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit sDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* empty block row: nothing stored locally */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /* initialize perm_u to the identity */
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+    /* NOTE(review): dead (#if 0) code -- this call passes a 6th arg &ldu, but num_full_cols_U above takes only 5; the live versions are in util.c. */
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/strfCommWrapper.c b/SRC/strfCommWrapper.c
new file mode 100644
index 00000000..de9cc01f
--- /dev/null
+++ b/SRC/strfCommWrapper.c
@@ -0,0 +1,548 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+int_t sDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     float *BlockUFactor,
+                     float *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    // xsup[] is needed by SuperSize() below
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        sIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        sIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /* Factor the diagonal block on its owner and broadcast it along the
+       process row and the process column. */
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Sgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U overwrites this buffer*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        sPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /* Isend U block to the process row; NOTE(review): buffer passed is BlockLFactor -- confirm intended. */
+        int_t nsupc = SuperSize(k);
+        sISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        sISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+int_t sLPanelTrSolve( int_t k,   int_t* factored_L,
+		      float* BlockUFactor,
+		      gridinfo_t *grid,
+		      sLUstruct_t *LUstruct)
+{
+    double alpha = 1.0;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /* Solve the L panel on processes in column kcol other than the owner. */
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /* NOTE(review): recv wait below is commented out; caller must guarantee the U diagonal block has arrived. */
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /* Triangular solve of the L panel rows below the diagonal block. */
+
+        int_t l = nsupr - nsupc;
+
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* sLPanelTrSolve */
+
+int_t sLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    float* BlockUFactor,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    /* Wait for the U diagonal block to arrive, then solve the L panel. */
+    sUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    sLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+}  /* sLPanelUpdate */
+
+#define BL  32
+
+int_t sUPanelTrSolve( int_t k,  
+                     float* BlockLFactor,
+                     float* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /* Solve the U panel on non-owner processes in row krow, using the received L factor. */
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        float *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        float *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                float *tempv = bigV +  thread_id * ldt * ldt;
+                sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /* Solve the U panel on the owner process, using the local L panel. */
+    if (iam == pkk)
+    {
+        /* owner path */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        float *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            float *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            float *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    float *tempv = bigV +  thread_id * ldt * ldt;
+                    sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* sUPanelTrSolve */
+
+int_t sUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    float* BlockLFactor,
+                    float* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+    /* Wait for the L diagonal block to arrive, then solve the U panel. */
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+
+    sUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
+    return 0;
+}
+
+int_t sIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    float* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* Broadcast (owner) or post a receive (non-owner) for L(:,k). */
+    if (mycol == kcol)
+    {
+        /* I own column k: broadcast L(:,k) across my process row. */
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        float* lusup = Lnzval_bc_ptr[lk];
+
+        sIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*place Irecv*/
+            sIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+int_t sIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, float* Uval_buf,
+    			     gridinfo_t *grid, sLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /* I own row k: broadcast U(k,:) down my process column. */
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        float* uval = Unzval_br_ptr[lk];
+        sIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* account the broadcast communication volume */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            sIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /* Sender side: wait for the L-panel Isends to complete. */
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*force wait for I recv to complete*/
+            sWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)   /* sender side */
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            sWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/superlu_FCnames.h b/SRC/superlu_FCnames.h
index 9bd95e2a..97b711a0 100644
--- a/SRC/superlu_FCnames.h
+++ b/SRC/superlu_FCnames.h
@@ -24,49 +24,75 @@ at the top-level directory.
 #define __SUPERLU_FCNAMES
 
 /* These are the functions defined in F90 wraper */
-#define f_create_gridinfo_handle         FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE)
-#define f_create_options_handle          FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE)
-#define f_create_ScalePerm_handle        FC_GLOBAL(f_create_scaleperm_handle,F_CREATE_SCALEPERM_HANDLE)
-#define f_create_LUstruct_handle         FC_GLOBAL(f_create_lustruct_handle,F_CREATE_LUSTRUCT_HANDLE)
-#define f_create_SOLVEstruct_handle      FC_GLOBAL(f_create_solvestruct_handle,F_CREATE_SOLVESTRUCT_HANDLE)
-#define f_create_SuperMatrix_handle      FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE)
-#define f_destroy_gridinfo_handle        FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE)
-#define f_destroy_options_handle         FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE)
-#define f_destroy_ScalePerm_handle       FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE)
-#define f_destroy_LUstruct_handle        FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE)
-#define f_destroy_SOLVEstruct_handle     FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE)
-#define f_destroy_SuperMatrix_handle     FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE)
-#define f_create_SuperLUStat_handle      FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE)
-#define f_destroy_SuperLUStat_handle     FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE)
-#define f_get_gridinfo                   FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO)
-#define f_get_SuperMatrix                FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX)
-#define f_set_SuperMatrix                FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX)
-#define f_get_CompRowLoc_Matrix          FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX)
-#define f_set_CompRowLoc_Matrix          FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX)
-#define f_get_superlu_options            FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS)
-#define f_set_superlu_options            FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS)
-#define f_set_default_options            FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS)
-#define f_superlu_gridinit               FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT)
-#define f_superlu_gridmap                FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP)
-#define f_superlu_gridexit               FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT)
-#define f_ScalePermstructInit            FC_GLOBAL(f_scalepermstructinit,F_SCALEPERMSTRUCTINIT)
-#define f_ScalePermstructFree            FC_GLOBAL(f_scalepermstructfree,F_SCALEPERMSTRUCTFREE)
-#define f_PStatInit                      FC_GLOBAL(f_pstatinit,F_PSTATINIT)
-#define f_PStatFree                      FC_GLOBAL(f_pstatfree,F_PSTATFREE)
-#define f_LUstructInit                   FC_GLOBAL(f_lustructinit,F_LUSTRUCTINIT)
-#define f_LUstructFree                   FC_GLOBAL(f_lustructfree,F_LUSTRUCTFREE)
-#define f_Destroy_LU                     FC_GLOBAL(f_destroy_lu,F_DESTROY_LU)
-#define f_dCreate_CompRowLoc_Mat_dist    FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST)
-#define f_zCreate_CompRowLoc_Mat_dist    FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST)
-#define f_Destroy_CompRowLoc_Mat_dist    FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST)
-#define f_Destroy_SuperMat_Store_dist    FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST)
-#define f_dSolveFinalize                 FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE)
-#define f_zSolveFinalize                 FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE)
-#define f_pdgssvx                        FC_GLOBAL(f_pdgssvx,F_PDGSSVX)
-#define f_pzgssvx                        FC_GLOBAL(f_pzgssvx,F_PZGSSVX)
-#define f_dcreate_dist_matrix            FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX)
-#define f_zcreate_dist_matrix            FC_GLOBAL(f_zcreate_dist_matrix,F_ZCREATE_DIST_MATRIX)
-#define f_check_malloc                   FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC)
+#define f_create_gridinfo_handle       FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE)
+#define f_create_gridinfo3d_handle     FC_GLOBAL(f_create_gridinfo3d_handle,F_CREATE_GRIDINFO3D_HANDLE)
+#define f_create_options_handle        FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE)
+#define f_create_SuperMatrix_handle    FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE)
+#define f_destroy_gridinfo_handle      FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE)
+#define f_destroy_options_handle       FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE)
+#define f_destroy_ScalePerm_handle     FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE)
+#define f_destroy_LUstruct_handle      FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE)
+#define f_destroy_SOLVEstruct_handle   FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE)
+#define f_destroy_SuperMatrix_handle   FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE)
+#define f_create_SuperLUStat_handle    FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE)
+#define f_destroy_SuperLUStat_handle   FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE)
+#define f_get_gridinfo                 FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO)
+#define f_get_gridinfo3d               FC_GLOBAL(f_get_gridinfo3d,F_GET_GRIDINFO3D)
+#define f_get_SuperMatrix              FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX)
+#define f_set_SuperMatrix              FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX)
+#define f_get_CompRowLoc_Matrix        FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX)
+#define f_set_CompRowLoc_Matrix        FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX)
+#define f_get_superlu_options          FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS)
+#define f_set_superlu_options          FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS)
+#define f_set_default_options          FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS)
+#define f_superlu_gridinit             FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT)
+#define f_superlu_gridinit3d           FC_GLOBAL(f_superlu_gridinit3d,F_SUPERLU_GRIDINIT3D)
+#define f_superlu_gridmap              FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP)
+#define f_superlu_gridexit             FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT)
+#define f_PStatInit                    FC_GLOBAL(f_pstatinit,F_PSTATINIT)
+#define f_PStatFree                    FC_GLOBAL(f_pstatfree,F_PSTATFREE)
+#define f_Destroy_CompRowLoc_Mat_dist  FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST)
+#define f_Destroy_SuperMat_Store_dist  FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST)
+#define f_check_malloc                 FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC)
+
+////// double
+#define f_dcreate_ScalePerm_handle     FC_GLOBAL(f_dcreate_scaleperm_handle,F_DCREATE_SCALEPERM_HANDLE)
+#define f_dcreate_LUstruct_handle      FC_GLOBAL(f_dcreate_lustruct_handle,F_DCREATE_LUSTRUCT_HANDLE)
+#define f_dcreate_SOLVEstruct_handle   FC_GLOBAL(f_dcreate_solvestruct_handle,F_DCREATE_SOLVESTRUCT_HANDLE)
+#define f_dScalePermstructInit         FC_GLOBAL(f_dscalepermstructinit,F_DSCALEPERMSTRUCTINIT)
+#define f_dScalePermstructFree         FC_GLOBAL(f_dscalepermstructfree,F_DSCALEPERMSTRUCTFREE)
+#define f_dLUstructInit                FC_GLOBAL(f_dlustructinit,F_DLUSTRUCTINIT)
+#define f_dLUstructFree                FC_GLOBAL(f_dlustructfree,F_DLUSTRUCTFREE)
+#define f_dDestroy_LU_SOLVE_struct     FC_GLOBAL(f_ddestroy_lu_solve_struct,F_DDESTROY_LU_SOLVE_STRUCT)
+#define f_dDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_ddestroy_lu_solve_struct_3d,F_DDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_dDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_ddestroy_a3d_gathered_on_2d,F_DDESTROY_A3D_GATHERED_ON_2D)
+
+#define f_dCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST)
+#define f_dSolveFinalize               FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE)
+#define f_pdgssvx                      FC_GLOBAL(f_pdgssvx,F_PDGSSVX)
+#define f_pdgssvx3d                    FC_GLOBAL(f_pdgssvx3d,F_PDGSSVX3D)
+#define f_dcreate_dist_matrix          FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX)
+#define f_dcreate_matrix_x_b           FC_GLOBAL(f_dcreate_matrix_x_b,F_DCREATE_MATRIX_X_B)
+#define f_dcreate_matrix_x_b_3d        FC_GLOBAL(f_dcreate_matrix_x_b_3d,F_DCREATE_MATRIX_X_B_3D)
+
+////// complex16
+#define f_zcreate_ScalePerm_handle     FC_GLOBAL(f_zcreate_scaleperm_handle,F_ZCREATE_SCALEPERM_HANDLE)
+#define f_zcreate_LUstruct_handle      FC_GLOBAL(f_zcreate_lustruct_handle,F_ZCREATE_LUSTRUCT_HANDLE)
+#define f_zcreate_SOLVEstruct_handle   FC_GLOBAL(f_zcreate_solvestruct_handle,F_ZCREATE_SOLVESTRUCT_HANDLE)
+#define f_zScalePermstructInit         FC_GLOBAL(f_zscalepermstructinit,F_ZSCALEPERMSTRUCTINIT)
+#define f_zScalePermstructFree         FC_GLOBAL(f_zscalepermstructfree,F_ZSCALEPERMSTRUCTFREE)
+#define f_zLUstructInit                FC_GLOBAL(f_zlustructinit,F_ZLUSTRUCTINIT)
+#define f_zLUstructFree                FC_GLOBAL(f_zlustructfree,F_ZLUSTRUCTFREE)
+#define f_zDestroy_LU_SOLVE_struct     FC_GLOBAL(f_zdestroy_lu_solve_struct,F_ZDESTROY_LU_SOLVE_STRUCT)
+#define f_zDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_zdestroy_lu_solve_struct_3d,F_ZDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_zDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_zdestroy_a3d_gathered_on_2d,F_ZDESTROY_A3D_GATHERED_ON_2D)
+
+#define f_zCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST)
+#define f_zSolveFinalize               FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE)
+#define f_pzgssvx                      FC_GLOBAL(f_pzgssvx,F_PZGSSVX)
+#define f_pzgssvx3d                    FC_GLOBAL(f_pzgssvx3d,F_PZGSSVX3D)
+#define f_zcreate_matrix_x_b           FC_GLOBAL(f_zcreate_matrix_x_b,F_ZCREATE_MATRIX_X_B)
+#define f_zcreate_matrix_x_b_3d        FC_GLOBAL(f_zcreate_matrix_x_b_3d,F_ZCREATE_MATRIX_X_B_3D)
 
 /* BLAS */
 #define sasum_    FC_GLOBAL(sasum,SASUM)
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index e258745b..9f94c037 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -334,9 +334,12 @@ typedef struct {
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
     int_t *xrow_to_proc; /* used by PDSLin */
+    NRformat_loc3d* A3d; /* Point to 3D {A, B} gathered on 2D layer 0.
+                            This needs to be persistent between
+			    3D factorization and solve.  */
 } dSOLVEstruct_t;
 
-#if 0 
+
 
 /*==== For 3D code ====*/
 
@@ -388,7 +391,7 @@ typedef struct
     int_t bigu_size;
     int_t offloadCondition;
     int_t superlu_acc_offload;
-    int_t nCudaStreams;
+    int_t nGPUStreams;
 } HyP_t;
 
 typedef struct 
@@ -423,13 +426,13 @@ typedef struct
 {
     double *bigU;
     double *bigV;
-} scuBufs_t;
+} dscuBufs_t;
 
 typedef struct
 {   
     double* BlockLFactor;
     double* BlockUFactor;
-} diagFactBufs_t;
+} ddiagFactBufs_t;
 
 typedef struct
 {
@@ -439,7 +442,7 @@ typedef struct
     lPanelInfo_t* lPanelInfo;
 } packLUInfo_t;
 
-#endif
+//#endif
 /*=====================*/
 
 /***********************************************************************
@@ -488,7 +491,7 @@ extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
 extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
-extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
+extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
 				  double **, int *, FILE *, char *, gridinfo_t *);
 
 extern void   dScalePermstructInit(const int_t, const int_t, 
@@ -534,6 +537,7 @@ extern void  pdCompute_Diag_Inv(int_t, dLUstruct_t *,gridinfo_t *, SuperLUStat_t
 extern int  dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
 		       int_t, dLUstruct_t *, gridinfo_t *, dSOLVEstruct_t *);
 extern void dSolveFinalize(superlu_dist_options_t *, dSOLVEstruct_t *);
+extern void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *, gridinfo3d_t *);
 extern int_t pdgstrs_init(int_t, int_t, int_t, int_t,
                           int_t [], int_t [], gridinfo_t *grid,
 	                  Glu_persist_t *, dSOLVEstruct_t *);
@@ -558,7 +562,7 @@ extern void dscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup,
                         int_t* lsub, int_t* usub, double* tempv,
                         int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr,
                         gridinfo_t * grid);
-extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
+extern int_t pdgstrf(superlu_dist_options_t *, int, int, double anorm,
 		    dLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
 
 /* #define GPU_PROF
@@ -649,12 +653,13 @@ extern void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
 extern void dZero_CompRowLoc_Matrix_dist(SuperMatrix *);
 extern void dScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, double);
 extern void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, double);
-extern void dZeroLblocks(int, int_t, gridinfo_t *, dLUstruct_t *);
+extern void dZeroLblocks(int, int, gridinfo_t *, dLUstruct_t *);
+extern void dZeroUblocks(int iam, int n, gridinfo_t *, dLUstruct_t *);
 extern void    dfill_dist (double *, int_t, double);
 extern void    dinf_norm_error_dist (int_t, int_t, double*, int_t,
                                      double*, int_t, gridinfo_t*);
 extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
-				double [], int_t , gridinfo_t *);
+				double [], int_t , MPI_Comm);
 extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *,
 			   double **, int_t **, int_t **);
 extern void  dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
@@ -702,14 +707,12 @@ extern void dgemm_(const char*, const char*, const int*, const int*, const int*,
                   const int*, const double*, double*, const int*, int, int);
 extern void dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*, int, int, int);
-extern void dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*,
-                  int*, int, int, int, int);
-extern void dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *, int);
-
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
+extern void dtrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const double*, const double*, const int*,
+		  double*, const int*, int, int, int, int);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                  const double *a, const int *, const double *, const int *,
+		  const double *, double *, const int *, int);
 
 #else
 extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
@@ -717,24 +720,62 @@ extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
                    const int*,  const double*, double*, const int*);
 extern int dtrsv_(char*, char*, char*, int*, double*, int*,
                   double*, int*);
-extern int dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*, int*);
-extern int dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *);
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
-
+extern int dtrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const double*, const double*, const int*,
+		  double*, const int*);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                  const double *a, const int *, const double *, const int *,
+		  const double *, double *, const int *);
 #endif
 
-extern int dscal_(int *n, double *da, double *dx, int *incx);
-extern int daxpy_(int *n, double *za, double *zx, 
-	               int *incx, double *zy, int *incy);
+extern void dger_(const int*, const int*, const double*,
+                 const double*, const int*, const double*, const int*,
+		 double*, const int*);
+
+extern int dscal_(const int *n, const double *alpha, double *dx, const int *incx);
+extern int daxpy_(const int *n, const double *alpha, const double *x, 
+	               const int *incx, double *y, const int *incy);
+
+/* SuperLU BLAS interface: dsuperlu_blas.c  */
+extern int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a,
+                  int lda, double *b, int ldb, double beta, double *c, int ldc);
+extern int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag, const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb);
+extern int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda);
+extern int superlu_dscal(const int n, const double alpha, double *x, const int incx);
+extern int superlu_daxpy(const int n, const double alpha,
+    const double *x, const int incx, double *y, const int incy);
+extern int superlu_dgemv(const char *trans, const int m,
+                  const int n, const double alpha, const double *a,
+                  const int lda, const double *x, const int incx,
+                  const double beta, double *y, const int incy);
+extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                  int n, double *a, int lda, double *x, int incx);
+
+#ifdef SLU_HAVE_LAPACK
 // LAPACK routine
 extern void dtrtri_(char*, char*, int*, double*, int*, int*);
+#endif
 
-
-#if 0
 /*==== For 3D code ====*/
+extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
+                     int *ldb, double **x, int *ldx,
+                     FILE *fp, gridinfo3d_t *grid3d);
+extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
+                           int *ldb, double **x, int *ldx,
+                           FILE *fp, char * postfix, gridinfo3d_t *grid3d);
+    
+/* Matrix distributed in NRformat_loc in 3D process grid. It converts 
+   it to a NRformat_loc distributed in 2D grid in grid-0 */
+extern void dGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, double *B,
+				   int ldb, int nrhs, gridinfo3d_t *grid3d,
+				   NRformat_loc3d **);
+extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 
 extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 		       dScalePermstruct_t *, double B[], int ldb, int nrhs,
@@ -750,13 +791,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
     /* from scatter.h */
 extern void
 dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
-                    Remain_info_t *Remain_info, double *L_mat, int_t ldl,
-                    double *U_mat, int_t ldu,  double *bigV,
+                    Remain_info_t *Remain_info, double *L_mat, int ldl,
+                    double *U_mat, int ldu,  double *bigV,
                     // int_t jj0,
                     int_t knsupc,  int_t klst,
                     int_t *lsub, int_t *usub, int_t ldt,
                     int_t thread_id,
-                    int_t *indirect, int_t *indirect2,
+                    int *indirect, int *indirect2,
                     int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                     int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
                     int_t *xsup, gridinfo_t *, SuperLUStat_t *
@@ -764,6 +805,8 @@ dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
                     , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                   );
+
+#ifdef _OPENMP
 /*this version uses a lock to prevent multiple thread updating the same block*/
 extern void
 dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
@@ -774,7 +817,7 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
                          int_t knsupc,  int_t klst,
                          int_t *lsub, int_t *usub, int_t ldt,
                          int_t thread_id,
-                         int_t *indirect, int_t *indirect2,
+                         int *indirect, int *indirect2,
                          int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                          int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
                          int_t *xsup, gridinfo_t *
@@ -782,11 +825,13 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
                          , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                        );
+#endif
+
 extern int_t
 dblock_gemm_scatterTopLeft( int_t lb,  int_t j, double* bigV,
 				 int_t knsupc,  int_t klst, int_t* lsub,
                                  int_t * usub, int_t ldt,
-				 int_t* indirect, int_t* indirect2,
+				 int* indirect, int* indirect2,
                                  HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                  SCT_t*SCT, SuperLUStat_t *
                                );
@@ -794,21 +839,21 @@ extern int_t
 dblock_gemm_scatterTopRight( int_t lb,  int_t j, double* bigV,
 				  int_t knsupc,  int_t klst, int_t* lsub,
                                   int_t * usub, int_t ldt,
-				  int_t* indirect, int_t* indirect2,
+				  int* indirect, int* indirect2,
                                   HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                   SCT_t*SCT, SuperLUStat_t * );
 extern int_t
 dblock_gemm_scatterBottomLeft( int_t lb,  int_t j, double* bigV,
 				    int_t knsupc,  int_t klst, int_t* lsub,
                                     int_t * usub, int_t ldt, 
-				    int_t* indirect, int_t* indirect2,
+				    int* indirect, int* indirect2,
                                     HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                     SCT_t*SCT, SuperLUStat_t * );
 extern int_t 
 dblock_gemm_scatterBottomRight( int_t lb,  int_t j, double* bigV,
 				     int_t knsupc,  int_t klst, int_t* lsub,
                                      int_t * usub, int_t ldt,
-				     int_t* indirect, int_t* indirect2,
+				     int* indirect, int* indirect2,
                                      HyP_t* HyP, dLUstruct_t *, gridinfo_t*,
                                      SCT_t*SCT, SuperLUStat_t * );
 
@@ -841,10 +886,10 @@ extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d
 extern void d3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
 			    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 
-extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
-		       superlu_dist_options_t *options, dLocalLU_t *Llu,
-		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
-		       int_t * iperm_c_supno);
+//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
+//		       superlu_dist_options_t *options, dLocalLU_t *Llu,
+//		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
+//		       int_t * iperm_c_supno);
 
 extern void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
 				  sForest_t**  sForests, dLUstruct_t* LUstruct,
@@ -855,28 +900,6 @@ extern int_t dgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests,
 				   SCT_t* SCT );
 
     /* The following are from pdgstrf2.h */
-#if 0 // Sherry: same routine names, but different code !!!!!!!
-extern void pdgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t,
-                          int_t k, double thresh, Glu_persist_t *,
-			  gridinfo_t *, dLocalLU_t *, MPI_Request *U_diag_blk_send_req,
-			  int tag_ub, SuperLUStat_t *, int *info, SCT_t *);
-#ifdef _CRAY
-void pdgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *,
-                      dLocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
-#else
-void pdgstrs2_omp (int_t, int_t, int_t, int_t *, double*, Glu_persist_t *, gridinfo_t *,
-                      dLocalLU_t *, SuperLUStat_t *, Ublock_info_t *, double *bigV, int_t ldt, SCT_t *SCT );
-#endif
-
-#else 
-extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k,
-			  double thresh, Glu_persist_t *, gridinfo_t *,
-			  dLocalLU_t *, MPI_Request *, int tag_ub,
-			  SuperLUStat_t *, int *info);
-extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *,
-			 dLocalLU_t *, Ublock_info_t *, SuperLUStat_t *);
-#endif // same routine names   !!!!!!!!
-
 extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr,
 			  int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*);
 extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k,
@@ -891,7 +914,7 @@ extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 			   double* uval, double *tempv);
 extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
                              int_t *usub, double* uval, double *tempv,
-                             int_t knsupc, int_t nsupr, double* lusup,
+                             int_t knsupc, int nsupr, double* lusup,
                              Glu_persist_t *Glu_persist)  ;
 extern void pdgstrs2
 #ifdef _CRAY
@@ -922,9 +945,9 @@ extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruc
 extern int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d);
 /*zero out LU non zero entries*/
 extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , dLUstruct_t *, gridinfo3d_t*);
-extern int AllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
-extern int DeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
-extern int DeAllocGlu_3d(dLUstruct_t *);
+extern int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
+extern int dDeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
+extern int dDeAllocGlu_3d(dLUstruct_t *);
 
 /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes)
 receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)]
@@ -935,7 +958,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver,
                         double* Lval_buf, double* Uval_buf,
                         dLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
 /*reduces all nodelists required in a level*/
-int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
+extern int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
                            int_t** treePerm,
                            dLUValSubBuf_t* LUvsb,
                            dLUstruct_t* LUstruct,
@@ -975,21 +998,21 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha,
     /* from communication_aux.h */
 extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup,
 			     gridinfo_t *, int* msgcnt, MPI_Request *,
-			     int_t **ToSendR, int_t *xsup, int );
+			     int **ToSendR, int_t *xsup, int );
 extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup,
-			   gridinfo_t *, int* msgcnt, int_t **ToSendR,
+			   gridinfo_t *, int* msgcnt, int **ToSendR,
 			   int_t *xsup , SCT_t*, int);
 extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
 			    gridinfo_t *, int* msgcnt, MPI_Request *,
-			    int_t *ToSendD, int );
+			    int *ToSendD, int );
 extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
-			   gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int);
+			   gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int);
 extern int_t dIrecv_LPanel (int_t k, int_t k0,  int_t* Lsub_buf, 
 			    double* Lval_buf, gridinfo_t *,
 			    MPI_Request *, dLocalLU_t *, int);
 extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*,
 			   dLocalLU_t *, gridinfo_t*, MPI_Request *, int);
-extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR,
+extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
 			MPI_Request *s, SCT_t*);
 extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
 extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *);
@@ -1062,7 +1085,7 @@ extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor,
 			    gridinfo_t *, dLUstruct_t *);
 
     /* from trfAux.h */
-extern int_t getNsupers(int, dLUstruct_t *);
+extern int getNsupers(int, Glu_persist_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
 extern int   freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
@@ -1073,33 +1096,29 @@ extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 				   double* Uval_buf, gridinfo_t *, dLUstruct_t *);
 extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      int_t*, int_t*, int_t*, gEtreeInfo_t*,
-				      factNodelists_t*, scuBufs_t*,
+				      factNodelists_t*, dscuBufs_t*,
 				      dLUValSubBuf_t* LUvsb, gridinfo_t *,
 				      dLUstruct_t *, HyP_t*);
 extern double* dgetBigV(int_t, int_t);
 extern double* dgetBigU(int_t, gridinfo_t *, dLUstruct_t *);
-extern int_t getBigUSize(int_t, gridinfo_t *, dLUstruct_t *);
 // permutation from superLU default
-extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
-			      dLUstruct_t *, gridinfo_t *);
-extern void getSCUweight(int_t nsupers, treeList_t* treeList, dLUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
 extern int_t dLluBufInit(dLUValSubBuf_t*, dLUstruct_t *);
 extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
-			  scuBufs_t*, dLUstruct_t*, gridinfo_t *);
-extern int dfreeScuBufs(scuBufs_t* scuBufs);
+			  dscuBufs_t*, dLUstruct_t*, gridinfo_t *);
+extern int dfreeScuBufs(dscuBufs_t* scuBufs);
 
 // the generic tree factoring code 
 extern int_t treeFactor(
     int_t nnnodes,          // number of nodes in the tree
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1115,11 +1134,11 @@ extern int_t dsparseTreeFactor(
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     treeTopoInfo_t* treeTopoInfo,
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1134,11 +1153,11 @@ extern int_t ddenseTreeFactor(
     int_t nnnodes,          // number of nodes in the tree
     int_t *perm_c_supno,    // list of nodes in the order of factorization
     commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,   // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
     dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
+    ddiagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     superlu_dist_options_t *options,
@@ -1152,11 +1171,11 @@ extern int_t ddenseTreeFactor(
 extern int_t dsparseTreeFactor_ASYNC(
     sForest_t* sforest,
     commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,     // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
     dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     gEtreeInfo_t*   gEtreeInfo,        // global etree info
@@ -1170,9 +1189,9 @@ extern int_t dsparseTreeFactor_ASYNC(
 );
 extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct);
 extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs);
-extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
-extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs);
-extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
+extern ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
+extern int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs);
+extern int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
 extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
@@ -1182,11 +1201,11 @@ extern int_t ancestorFactor(
     int_t ilvl,             // level of factorization 
     sForest_t* sforest,
     commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
+    dscuBufs_t *scuBufs,     // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
     dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
+    ddiagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
     gEtreeInfo_t*   gEtreeInfo,        // global etree info
@@ -1198,8 +1217,8 @@ extern int_t ancestorFactor(
     double thresh,  SCT_t *SCT, int tag_ub, int *info
 );
 
-/*=====================*/
-#endif  // end 3D prototypes
+/*== end 3D prototypes ===================*/
+
 
 #ifdef __cplusplus
   }
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index c887df04..d71b930e 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -23,6 +23,9 @@ at the top-level directory.
  *     February 8, 2019    version 6.1.1
  *     November 12, 2019   version 6.2.0
  *     October 23, 2020    version 6.4.0
+ *     May 12, 2021        version 7.0.0
+ *     October 5, 2021     version 7.1.0
+ *     October 18, 2021    version 7.1.1
  * 
*/ @@ -47,9 +50,10 @@ at the top-level directory. #include #include #include -//#include +// #include #include #include +//#include Sherry: not available on Mac OS // /* Following is for vtune */ // #if 0 // #include @@ -72,10 +76,10 @@ at the top-level directory. * #endif * Versions 4.x and earlier do not include a #define'd version numbers. */ -#define SUPERLU_DIST_MAJOR_VERSION 6 -#define SUPERLU_DIST_MINOR_VERSION 4 -#define SUPERLU_DIST_PATCH_VERSION 0 -#define SUPERLU_DIST_RELEASE_DATE "October 23, 2020" +#define SUPERLU_DIST_MAJOR_VERSION 7 +#define SUPERLU_DIST_MINOR_VERSION 1 +#define SUPERLU_DIST_PATCH_VERSION 1 +#define SUPERLU_DIST_RELEASE_DATE "October 18, 2021" #include "superlu_dist_config.h" @@ -103,7 +107,7 @@ at the top-level directory. #elif defined (_LONGINT) typedef int64_t int_t; #define mpi_int_t MPI_LONG_LONG_INT - #define IFMT "%ld" + #define IFMT "%lld" #else /* Default */ typedef int int_t; #define mpi_int_t MPI_INT @@ -111,7 +115,6 @@ at the top-level directory. 
#endif - /* MPI C complex datatype */ #define SuperLU_MPI_COMPLEX MPI_C_COMPLEX #define SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX @@ -122,13 +125,20 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; */ #include "superlu_FortranCInterface.h" -//#include "Cnames.h" #include "superlu_FCnames.h" #include "superlu_enum_consts.h" #include "supermatrix.h" #include "util_dist.h" #include "psymbfact.h" +#ifdef GPU_ACC +#include +#endif + + +#define MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ + + #define ISORT /* NOTE: qsort() has bug on Mac */ /*********************************************************************** @@ -193,7 +203,7 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; * 0,1: for sending L to "right" * * 2,3: for sending off-diagonal blocks of U "down" * * 4 : for sending the diagonal blcok down (in pxgstrf2) */ -#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) +//#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) /* For numeric factorization. 
*/ #if 0 @@ -340,16 +350,40 @@ typedef struct { int Iam; /* my process number */ } superlu_scope_t; -/*-- Process grid definition */ +/*-- 2D process grid definition */ typedef struct { MPI_Comm comm; /* MPI communicator */ superlu_scope_t rscp; /* process scope in rowwise, horizontal directon */ superlu_scope_t cscp; /* process scope in columnwise, vertical direction */ - int iam; /* my process number in this scope */ + int iam; /* my process number in this grid */ int_t nprow; /* number of process rows */ int_t npcol; /* number of process columns */ } gridinfo_t; +/*-- 3D process grid definition */ +typedef struct { + MPI_Comm comm; /* MPI communicator */ + superlu_scope_t rscp; /* row scope */ + superlu_scope_t cscp; /* column scope */ + superlu_scope_t zscp; /* scope in third dimension */ + gridinfo_t grid2d; /* for using 2D functions */ + int iam; /* my process number in this grid */ + int_t nprow; /* number of process rows */ + int_t npcol; /* number of process columns */ + int_t npdep; /* number of replication factor in Z-dimension */ + int rankorder; /* = 0: Z-major ( default ) + * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 3 6 9 + * 1 4 7 10 + * 2 5 8 11 + * = 1: XY-major (need set env. var.: RANKORDER=XY) + * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 1 2 4 + * 5 6 7 8 + * 9 10 11 12 + */ +} gridinfo3d_t; + /* *-- The structures are determined by SYMBFACT and used thereafter. @@ -670,12 +704,13 @@ typedef struct { yes_no_t lookahead_etree; /* use etree computed from the serial symbolic factorization */ yes_no_t SymPattern; /* symmetric factorization */ + yes_no_t Algo3d; /* use 3D factorization/solve algorithms */ } superlu_dist_options_t; typedef struct { float for_lu; float total; - int_t expansions; + int expansions; int64_t nnzL, nnzU; } superlu_dist_mem_usage_t; @@ -693,12 +728,18 @@ typedef struct { int_t iukp; int_t jb; int_t full_u_cols; + int_t eo; /* order of elimination. 
For 3D algorithm */ + int_t ncols; + int_t StCol; } Ublock_info_t; typedef struct { int_t lptr; int_t ib; + int_t eo; /* order of elimination, for 3D code */ + int_t nrows; int_t FullRow; + int_t StRow; } Remain_info_t; typedef struct @@ -715,6 +756,195 @@ struct superlu_pair /**--------**/ +/*==== For 3D code ====*/ + +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) */ +// int tag_ub; +// #define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) + +// #undef SLU_MPI_TAG +/*defining my own MPI tags */ +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) * + * 5 : for sending the diagonal L block right () : added by piyush */ +#define SLU_MPI_TAG(id,num) ( (6*(num)+id) % tag_ub ) + +/*structs for quick look up */ +typedef struct +{ + int_t luptrj; + int_t lptrj; + int_t lib; +} local_l_blk_info_t; + +typedef struct +{ + int_t iuip; + int_t ruip; + int_t ljb; +} local_u_blk_info_t; + + +//global variable +extern double CPU_CLOCK_RATE; + +typedef struct +{ + int_t *perm_c_supno; + int_t *iperm_c_supno; +} perm_array_t; + +typedef struct +{ + int_t* factored; + int_t* factored_D; + int_t* factored_L; + int_t* factored_U; + int_t* IrecvPlcd_D; + int_t* IbcastPanel_L; /*I bcast and recv placed for the k-th L panel*/ + int_t* IbcastPanel_U; /*I bcast and recv placed for the k-th U panel*/ + int_t* numChildLeft; /*number of children left to be factored*/ + int_t* gpuLUreduced; /*New for GPU acceleration*/ +}factStat_t; + +typedef struct +{ + int_t next_col; + int_t next_k; + int_t kljb; + int_t kijb; + 
int_t copyL_kljb; + int_t copyU_kljb; + int_t l_copy_len; + int_t u_copy_len; + int_t *kindexL; + int_t *kindexU; + int_t mkrow; + int_t mkcol; + int_t ksup_size; +} d2Hreduce_t; + +typedef struct{ + int_t numChild; + int_t numDescendents; + int_t left; + int_t right; + int_t extra; + int_t* childrenList; + int_t depth; // distance from the top + double weight; // weight of the supernode + double iWeight; // weight of the whole subtree below + double scuWeight; // weight of schur complement update = max|n_k||L_k||U_k| +} treeList_t; + +typedef struct +{ + int_t numLvl; // number of level in tree; + int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; + +} treeTopoInfo_t; + +typedef struct +{ + int_t* setree; // global supernodal elimination tree + int_t* numChildLeft; +} gEtreeInfo_t; + +typedef enum treePartStrat{ + ND, // nested dissection ordering or natural ordering + GD // greedy load balance stregy +}treePartStrat; + +typedef struct +{ + /* data */ + int_t nNodes; // total number of nodes + int_t* nodeList; // list of nodes, should be in order of factorization +#if 0 // Sherry: the following array is used on rForest_t. ??? + int_t* treeHeads; +#endif + /*topological information about the tree*/ + int_t numLvl; // number of Topological levels in the forest + int_t numTrees; // number of tree in the forest + treeTopoInfo_t topoInfo; // +#if 0 // Sherry fix: the following two structures are in treeTopoInfo_t. ??? 
+ int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; +#endif + + /*information about load balance*/ + double weight; // estimated cost + double cost; // measured cost + +} sForest_t; + +typedef struct +{ + /* data */ + MPI_Request* L_diag_blk_recv_req; + MPI_Request* L_diag_blk_send_req; + MPI_Request* U_diag_blk_recv_req; + MPI_Request* U_diag_blk_send_req; + MPI_Request* recv_req; + MPI_Request* recv_requ; + MPI_Request* send_req; + MPI_Request* send_requ; +} commRequests_t; + +typedef struct +{ + int_t *iperm_c_supno; + int_t *iperm_u; + int_t *perm_u; + int *indirect; + int *indirect2; + +} factNodelists_t; + +typedef struct +{ + int* msgcnt; + int* msgcntU; +} msgs_t; + +typedef struct xtrsTimer_t +{ + double trsDataSendXY; + double trsDataSendZ; + double trsDataRecvXY; + double trsDataRecvZ; + double t_pdReDistribute_X_to_B; + double t_pdReDistribute_B_to_X; + double t_forwardSolve; + double tfs_compute; + double tfs_comm; + double t_backwardSolve; + double tbs_compute; + double tbs_comm; + double tbs_tree[2*MAX_3D_LEVEL]; + double tfs_tree[2*MAX_3D_LEVEL]; + + // counters for communication and computation volume + + int_t trsMsgSentXY; + int_t trsMsgSentZ; + int_t trsMsgRecvXY; + int_t trsMsgRecvZ; + + double ppXmem; // perprocess X-memory +} xtrsTimer_t; + +/*==== For 3D code ====*/ + +/*====================*/ /*********************************************************************** * Function prototypes @@ -724,11 +954,14 @@ struct superlu_pair extern "C" { #endif -extern void set_default_options_dist(superlu_dist_options_t *); -extern void superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *); -extern void superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t, - gridinfo_t *); +extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *); +extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *); extern void superlu_gridexit(gridinfo_t *); +extern void 
superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, + gridinfo3d_t *grid) ; +extern void superlu_gridexit3d(gridinfo3d_t *grid); + +extern void set_default_options_dist(superlu_dist_options_t *); extern void print_options_dist(superlu_dist_options_t *); extern void print_sp_ienv_dist(superlu_dist_options_t *); extern void Destroy_CompCol_Matrix_dist(SuperMatrix *); @@ -777,7 +1010,7 @@ extern int_t estimate_bigu_size (int_t, int_t **, Glu_persist_t *, /* Auxiliary routines */ extern double SuperLU_timer_ (); extern void superlu_abort_and_exit_dist(char *); -extern int_t sp_ienv_dist (int_t); +extern int sp_ienv_dist (int); extern void ifill_dist (int_t *, int_t, int_t); extern void super_stats_dist (int_t, int_t *); extern void get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *, @@ -824,6 +1057,18 @@ extern int_t psymbfact_LUXpand_RL extern int_t psymbfact_prLUXpand (int_t, int_t, int, Llu_symbfact_t *, psymbfact_stat_t *); +#ifdef ISORT +extern void isort (int_t N, int_t *ARRAY1, int_t *ARRAY2); +extern void isort1 (int_t N, int_t *ARRAY); +#else +int superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + #ifdef GPU_ACC /* GPU related */ extern void gemm_division_cpu_gpu (int *, int *, int *, int, int, int, int *, int); @@ -831,12 +1076,16 @@ extern int_t get_gpublas_nb (); extern int_t get_num_gpu_streams (); #endif +extern double estimate_cpu_time(int m, int n , int k); + extern int get_thread_per_process(); extern int_t get_max_buffer_size (); extern int_t get_min (int_t *, int_t); extern int compare_pair (const void *, const void *); extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t, int_t *, int_t *, int); +extern int get_acc_offload(); + /* Routines for debugging */ extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); @@ -849,7 +1098,6 @@ extern int 
file_PrintInt10(FILE *, char *, int_t, int_t *); extern int file_PrintInt32(FILE *, char *, int, int *); extern int file_PrintLong10(FILE *, char *, int_t, int_t *); - /* Routines for Async_tree communication*/ #ifndef __SUPERLU_ASYNC_TREE /* allow multiple inclusions */ @@ -886,6 +1134,167 @@ extern yes_no_t C_BcTree_IsRoot(C_Tree* tree); extern void C_BcTree_forwardMessageSimple(C_Tree* tree, void* localBuffer, int msgSize); extern void C_BcTree_waitSendRequest(C_Tree* tree); +/*==== For 3D code ====*/ + +extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); +extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); +extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT); + +// permutation from superLU default +extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, + int_t *etree, Glu_persist_t *Glu_persist, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo_t *); + +/* Manipulate counters */ +extern void SCT_init(SCT_t*); +extern void SCT_print(gridinfo_t *grid, SCT_t* SCT); +extern void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_free(SCT_t*); + +extern treeList_t* setree2list(int_t nsuper, int_t* setree ); +extern int free_treelist(int_t nsuper, treeList_t* treeList); + +// int_t calcTreeWeight(int_t nsupers, treeList_t* treeList, int_t* xsup); +extern int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup); +extern int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList); +extern int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList); +extern int_t getCommonAncsCount(int_t k, treeList_t* treeList); +extern int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup,int_t* iperm_c_sup); +extern int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder); +extern int_t* 
getSubTreeRoots(int_t k, treeList_t* treeList); +// int_t* treeList2perm(treeList_t* , ..); +extern int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms); +// returns a concatenated permutation for three permutation arrays + +extern int_t* getGlobal_iperm(int_t nsupers, int_t nperms, int_t** perms, + int_t* nnodes); +extern int_t log2i(int_t index); +extern int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup); +extern int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount); +extern int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel); + +/*takes supernodal elimination tree and for each + supernode calculates "level" in elimination tree*/ +extern int_t* topological_ordering(int_t nsuper, int_t* setree); +extern int_t* Etree_LevelBoundry(int_t* perm,int_t* tsort_etree, int_t nsuper); + +/*calculated boundries of the topological levels*/ +extern int_t* calculate_num_children(int_t nsuper, int_t* setree); +extern void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper); +extern void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper); +extern void print_etree(int_t *setree, int_t* iperm, int_t nsuper); +extern int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree); +int* getLastDepBtree( int_t nsupers, treeList_t* treeList); + +/*returns array R with of size maxLevel with either 0 or 1 + R[i] = 1; then Tree[level-i] is set to zero= to only + accumulate the results */ +extern int_t* getReplicatedTrees( gridinfo3d_t* grid3d); + +/*returns indices in gNodeList of trees that belongs to my layer*/ +extern int_t* getGridTrees( gridinfo3d_t* grid3d); + + +/*returns global nodelist*/ +extern int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList); + +/* calculate number of nodes in subtrees starting from treeHead[i]*/ +extern int_t* calcNumNodes(int_t maxLvl, 
int_t* treeHeads, treeList_t* treeList); + +/*Returns list of (last) node of the trees */ +extern int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList); + +extern int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm); + +extern int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ); + +extern int_t* getMyEtLims(int_t nnodes, int_t* myTopOrder); + + +extern treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers, + int_t* myPerm,int_t* setree); + +extern sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + +extern int_t** getTreePermForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t* sForests, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d); +extern int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs, + sForest_t** sForests); +extern int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests); +extern int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests); +// int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d); +extern int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm); +extern void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d); +extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList); +extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + + /* from trfAux.h */ +extern int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr); +extern void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo3d_t * grid3d); +extern int getNsupers(int n, Glu_persist_t *Glu_persist); +extern int set_tag_ub(); +extern int getNumThreads(int); +extern int_t 
num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *, int_t *, int_t *); +#if 0 // Sherry: conflicting with existing routine +extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *, gridinfo_t*, int_t* perm_u); +#endif +extern int_t* getFactPerm(int_t); +extern int_t* getFactIperm(int_t*, int_t); + +extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid); +extern int_t initFactStat(int_t nsupers, factStat_t* factStat); +extern int freeFactStat(factStat_t* factStat); +extern int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t*); +extern int freeFactNodelists(factNodelists_t* fNlists); +extern int_t initMsgs(msgs_t* msgs); +extern int_t getNumLookAhead(superlu_dist_options_t*); +extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss); + +extern msgs_t** initMsgsArr(int_t numLA); +extern int freeMsgsArr(int_t numLA, msgs_t **msgss); + +extern int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, + int_t *usub, Glu_persist_t *, SuperLUStat_t*); + + /* from sec_structs.h */ +extern int Cmpfunc_R_info (const void * a, const void * b); +extern int Cmpfunc_U_info (const void * a, const void * b); +extern int sort_R_info( Remain_info_t* Remain_info, int n ); +extern int sort_U_info( Ublock_info_t* Ublock_info, int n ); +extern int sort_R_info_elm( Remain_info_t* Remain_info, int n ); +extern int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ); + + /* from pdgstrs.h */ +extern void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d); +extern void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid); + + /* from p3dcomm.c */ +extern int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t* getMyNodeCounts(int_t maxLvl, 
int_t* myTreeIdxs, int_t* gNodeCount); +extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); +extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d); + + extern int getnGPUStreams(); + extern int get_mpi_process_per_gpu (); + +/*=====================*/ + #ifdef __cplusplus } #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 3f6eaf2b..b99ed621 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -20,5 +20,4 @@ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in index 3164559d..9c3142aa 100644 --- a/SRC/superlu_dist_config.h.in +++ b/SRC/superlu_dist_config.h.in @@ -20,5 +20,4 @@ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h index 4d2a6d03..3103e46e 100644 --- a/SRC/superlu_enum_consts.h +++ b/SRC/superlu_enum_consts.h @@ -15,7 +15,6 @@ at the top-level directory. * Lawrence Berkeley National Lab, Univ. 
of California Berkeley, * October 1, 2010 * January 28, 2018 - * January 28, 2018 * */ @@ -78,7 +77,7 @@ typedef enum { SOL_COMM,/* communication for solve */ SOL_GEMM,/* gemm for solve */ SOL_TRSM,/* trsm for solve */ - SOL_TOT, /* LU-solve time*/ + SOL_TOT, /* LU-solve time*/ RCOND, /* estimate reciprocal condition number */ SOLVE, /* forward and back solves */ REFINE, /* perform iterative refinement */ diff --git a/SRC/superlu_gpu_utils.cu b/SRC/superlu_gpu_utils.cu new file mode 100644 index 00000000..b472e5b9 --- /dev/null +++ b/SRC/superlu_gpu_utils.cu @@ -0,0 +1,94 @@ +#include "superlu_defs.h" + +/*error reporting functions */ +gpuError_t checkGPU(gpuError_t result) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (result != gpuSuccess) { + fprintf(stderr, "GPU Runtime Error: %s\n", gpuGetErrorString(result)); + assert(result == gpuSuccess); + } +#endif + return result; +} + + + +__device__ int dnextpow2(int v) + +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + + return v; +} + + +typedef int pfx_dtype ; +__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n) +{ + // extern __shared__ pfx_dtype temp[]; + int n_original = n; + n = (n & (n - 1)) == 0? 
n: dnextpow2(n); + int thread_id = threadIdx.x; + int offset = 1; + if(2*thread_id < n_original) + temp[2*thread_id] = inOutArr[2*thread_id]; + else + temp[2*thread_id] =0; + + + if(2*thread_id+1 >1; d > 0; d >>= 1) + { + __syncthreads(); + if (thread_id < d) + { + int ai = offset*(2*thread_id+1)-1; + int bi = offset*(2*thread_id+2)-1; + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (thread_id == 0) { temp[n - 1] = 0; } + for (int d = 1; d < n; d *= 2) + { + offset >>= 1; + __syncthreads(); + if (thread_id < d) + { + int ai = offset*(2*thread_id+1)-1; + int bi = offset*(2*thread_id+2)-1; + pfx_dtype t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + if(2*thread_id < n_original) + inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory + if(2*thread_id+1 < n_original) + inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1]; + __syncthreads(); + +} /* end incScan */ + + +#if 0 // Not used +__global__ void gExScan(pfx_dtype *inArr, int n) +{ + extern __shared__ pfx_dtype temp[]; + incScan(inArr, temp, n); + +} +#endif diff --git a/SRC/superlu_gpu_utils.hip.cpp b/SRC/superlu_gpu_utils.hip.cpp new file mode 100644 index 00000000..23b669dc --- /dev/null +++ b/SRC/superlu_gpu_utils.hip.cpp @@ -0,0 +1 @@ +#include "superlu_gpu_utils.cu" \ No newline at end of file diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index 4a23cccc..4eff3cb7 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -12,10 +12,11 @@ at the top-level directory. * \brief SuperLU grid utilities * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * September 1, 1999
  * February 8, 2019  version 6.1.1
+ * October 5, 2021
  * 
*/ @@ -27,17 +28,22 @@ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; #endif /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, int_t npcol, gridinfo_t *grid) + int nprow, int npcol, gridinfo_t *grid) { int Np = nprow * npcol; - int_t *usermap; + int *usermap; int i, j, info; /* Make a list of the processes in the new communicator. */ - usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + usermap = SUPERLU_MALLOC(Np*sizeof(int)); for (j = 0; j < npcol; ++j) for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; @@ -55,30 +61,38 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid); SUPERLU_FREE(usermap); - + #ifdef GPU_ACC - /* Binding each MPI to a CUDA device */ - int devs, rank; + /* Binding each MPI to a GPU device */ + char *ttemp; + ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); + + if (ttemp) { + int devs, rank; MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD?? gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices gpuSetDevice(rank % devs); // Set device to be used for GPU executions + } #endif - - } /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridmap( MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, - int_t npcol, - int_t usermap[], /* usermap(i,j) holds the process + int nprow, + int npcol, + int usermap[], /* usermap(i,j) holds the process number to be placed in {i,j} of the process grid. 
*/ - int_t ldumap, /* The leading dimension of the + int ldumap, /* The leading dimension of the 2D array usermap[]. */ gridinfo_t *grid) { @@ -118,17 +132,17 @@ void superlu_gridmap( MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); /* Create the new communicator. */ /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. */ + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); - /* Bail out if I am not in the group, superlu_group. */ + /* Bail out if I am not in the group "superlu_grp". */ if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; + // grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; } MPI_Comm_rank( grid->comm, &(grid->iam) ); @@ -176,14 +190,16 @@ void superlu_gridmap( } #endif + gridmap_out: SUPERLU_FREE(pranks); MPI_Group_free(&superlu_grp); MPI_Group_free(&mpi_base_group); -} + +} /* superlu_gridmap */ void superlu_gridexit(gridinfo_t *grid) { - if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + if ( grid->comm != MPI_COMM_NULL ) { /* Marks the communicator objects for deallocation. */ MPI_Comm_free( &grid->rscp.comm ); MPI_Comm_free( &grid->cscp.comm ); diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c new file mode 100644 index 00000000..25f7ba4b --- /dev/null +++ b/SRC/superlu_grid3d.c @@ -0,0 +1,273 @@ +/*! @file + * \brief SuperLU grid utilities + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * 
+ */ + +#include "superlu_ddefs.h" + +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int nprow, + int npcol, + int npdep, + gridinfo3d_t *grid); + + +/*! \brief All processes in the MPI communicator must call this routine. + */ +void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int nprow, int npcol, int npdep, gridinfo3d_t *grid) +{ + int Np = nprow * npcol * npdep; + int i, j, info; + + /* Make a list of the processes in the new communicator. */ + // usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + // for (j = 0; j < npcol; ++j) + // for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + MPI_Comm_size( Bcomm, &info ); + if ( info < Np ) + ABORT("Number of processes is smaller than NPROW * NPCOL * NPDEP"); + + superlu_gridmap3d(Bcomm, nprow, npcol, npdep, grid); + + // SUPERLU_FREE(usermap); + +#ifdef GPU_ACC + /* Binding each MPI to a GPU device */ + char *ttemp; + ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); + + if (ttemp) { + int devs, rank; + MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD?? + gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices + gpuSetDevice(rank % devs); // Set device to be used for GPU executions + } +#endif +} + + +/*! \brief All processes in the MPI communicator must call this routine. + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 + */ +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. 
*/ + int nprow, + int npcol, + int npdep, + gridinfo3d_t *grid) +{ + MPI_Group mpi_base_group, superlu_grp; + int Np = nprow * npcol * npdep, mycol, myrow; + int *pranks; + int i, j, info; + +#if 0 // older MPI doesn't support complex in C + /* Create datatype in C for MPI complex. */ + if ( SuperLU_MPI_DOUBLE_COMPLEX == MPI_DATATYPE_NULL ) { + MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX ); + MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX ); + } +#endif + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + /* Make a list of the processes in the new communicator. */ + pranks = (int *) SUPERLU_MALLOC(Np * sizeof(int)); + for (j = 0; j < Np; ++j) + pranks[j] = j; + + /* + * Form MPI communicator for all. + */ + /* Get the group underlying Bcomm. */ + MPI_Comm_group( Bcomm, &mpi_base_group ); + /* Create the new group. */ + MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); + /* Create the new communicator. */ + /* NOTE: The call is to be executed by all processes in Bcomm, + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ + MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); + + /* Bail out if I am not in the group, superlu_group. 
*/ + if ( grid->comm == MPI_COMM_NULL ) { + //grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; + } + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + /* Create 3D grid */ + int ndim = 3; + int dims[3]; + int reorder = 1; + int periodic[] = {0, 0, 0}; + int coords3d[3]; + int iam; + MPI_Comm superlu3d_comm; + + if (getenv("RANKORDER") && strcmp(getenv("RANKORDER"), "XY" )) + { + grid->rankorder = 1; // XY-major + + dims[0] = nprow; + dims[1] = npcol; + dims[2] = npdep; + + // create the new communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + // get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + int rowc[3] = {1, 0, 0}; + int colc[3] = {0, 1, 0}; + int depc[3] = {0, 0, 1}; + + // Partition a communicator into subgroups which form + // lower-dimensional cartesian subgrids + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); /* XZ grids */ + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); /* YZ grids */ + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); /* XY grids */ + + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[0]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[1]; + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[2]; + + // + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {1, 1, 0}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } else { /* default */ + grid->rankorder = 0; // Z-major + + dims[1] = nprow; + dims[2] = npcol; + dims[0] = npdep; + + // get the communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + //get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + 
MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + /* printf("(%d) My coordinats are (%d %d %d)\n", + iam, coords3d[0], coords3d[1], coords3d[2] ); + fflush(stdout); */ + + // create row communicator + + int rowc[3] = {0, 1, 0}; + int colc[3] = {0, 0, 1}; + int depc[3] = {1, 0, 0}; + + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); + + // 2x3: 0,2,4 / 1,3,5 column-major + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[1]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[2]; + + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[0]; + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {0, 1, 1}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } /* if RANKORDER */ + + + // Initialize grid2d; + + grid->grid2d.rscp = grid->rscp; + grid->grid2d.cscp = grid->cscp; + grid->grid2d.nprow = nprow; + grid->grid2d.npcol = npcol; + MPI_Comm_rank( grid->grid2d.comm, &(grid->grid2d.iam)); + + // grid->grid2d.cscp = grid->cscp; + +#if 1 + if ( (grid->zscp).Iam == 0) { + printf("(3d grid: layer 0) iam %d, grid->grid2d.iam %d\n", + grid->iam, (grid->grid2d).iam); + } + fflush(stdout); +#endif + + MPI_Comm_free( &superlu3d_comm ); // Sherry added + + gridmap_out: + SUPERLU_FREE(pranks); + MPI_Group_free( &superlu_grp ); + MPI_Group_free( &mpi_base_group ); +} + +void superlu_gridexit3d(gridinfo3d_t *grid) +{ + if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + /* Marks the communicator objects for deallocation. 
*/ + MPI_Comm_free( &grid->rscp.comm ); + MPI_Comm_free( &grid->cscp.comm ); + MPI_Comm_free( &grid->zscp.comm ); + MPI_Comm_free( &grid->grid2d.comm ); + MPI_Comm_free( &grid->comm ); + } +#if 0 + if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) { + MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX ); + SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; /* some MPI system does not set this + to be NULL after Type_free */ + } +#endif +} diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index 146d3acb..9af2d7ba 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -225,9 +225,12 @@ typedef struct { positions in the gathered x-vector. This is re-used in repeated calls to pzgsmv() */ int_t *xrow_to_proc; /* used by PDSLin */ + NRformat_loc3d* A3d; /* Point to 3D {A, B} gathered on 2D layer 0. + This needs to be peresistent between + 3D factorization and solve. */ } zSOLVEstruct_t; -#if 0 + /*==== For 3D code ====*/ @@ -279,7 +282,7 @@ typedef struct int_t bigu_size; int_t offloadCondition; int_t superlu_acc_offload; - int_t nCudaStreams; + int_t nGPUStreams; } HyP_t; typedef struct @@ -314,13 +317,13 @@ typedef struct { doublecomplex *bigU; doublecomplex *bigV; -} scuBufs_t; +} zscuBufs_t; typedef struct { doublecomplex* BlockLFactor; doublecomplex* BlockUFactor; -} diagFactBufs_t; +} zdiagFactBufs_t; typedef struct { @@ -330,7 +333,7 @@ typedef struct lPanelInfo_t* lPanelInfo; } packLUInfo_t; -#endif +//#endif /*=====================*/ /*********************************************************************** @@ -379,7 +382,7 @@ extern int zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); extern int zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); -extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, +extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, 
char *, gridinfo_t *); extern void zScalePermstructInit(const int_t, const int_t, @@ -425,6 +428,7 @@ extern void pzCompute_Diag_Inv(int_t, zLUstruct_t *,gridinfo_t *, SuperLUStat_t extern int zSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [], int_t, zLUstruct_t *, gridinfo_t *, zSOLVEstruct_t *); extern void zSolveFinalize(superlu_dist_options_t *, zSOLVEstruct_t *); +extern void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *, gridinfo3d_t *); extern int_t pzgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, zSOLVEstruct_t *); @@ -449,7 +453,7 @@ extern void zscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup, int_t* lsub, int_t* usub, doublecomplex* tempv, int_t ** Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t * grid); -extern int_t pzgstrf(superlu_dist_options_t *, int, int, double, +extern int_t pzgstrf(superlu_dist_options_t *, int, int, double anorm, zLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); /* #define GPU_PROF @@ -534,12 +538,13 @@ extern void zCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *); extern void zZero_CompRowLoc_Matrix_dist(SuperMatrix *); extern void zScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, doublecomplex); extern void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, doublecomplex); -extern void zZeroLblocks(int, int_t, gridinfo_t *, zLUstruct_t *); +extern void zZeroLblocks(int, int, gridinfo_t *, zLUstruct_t *); +extern void zZeroUblocks(int iam, int n, gridinfo_t *, zLUstruct_t *); extern void zfill_dist (doublecomplex *, int_t, doublecomplex); extern void zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t, doublecomplex*, int_t, gridinfo_t*); extern void pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t, - doublecomplex [], int_t , gridinfo_t *); + doublecomplex [], int_t , MPI_Comm); extern void zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, doublecomplex **, int_t **, int_t **); extern void 
zreadtriple_dist(FILE *, int_t *, int_t *, int_t *, @@ -582,14 +587,12 @@ extern void zgemm_(const char*, const char*, const int*, const int*, const int*, const int*, const doublecomplex*, doublecomplex*, const int*, int, int); extern void ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, doublecomplex*, int*, int, int, int); -extern void ztrsm_(char*, char*, char*, char*, int*, int*, - doublecomplex*, doublecomplex*, int*, doublecomplex*, - int*, int, int, int, int); -extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, - doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int); - -extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, - doublecomplex*, int*, doublecomplex*, int*); +extern void ztrsm_(const char*, const char*, const char*, const char*, + const int*, const int*, const doublecomplex*, const doublecomplex*, const int*, + doublecomplex*, const int*, int, int, int, int); +extern void zgemv_(const char *, const int *, const int *, const doublecomplex *, + const doublecomplex *a, const int *, const doublecomplex *, const int *, + const doublecomplex *, doublecomplex *, const int *, int); #else extern int zgemm_(const char*, const char*, const int*, const int*, const int*, @@ -597,24 +600,62 @@ extern int zgemm_(const char*, const char*, const int*, const int*, const int*, const int*, const doublecomplex*, doublecomplex*, const int*); extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*, doublecomplex*, int*); -extern int ztrsm_(char*, char*, char*, char*, int*, int*, - doublecomplex*, doublecomplex*, int*, doublecomplex*, int*); -extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, - doublecomplex *, int *, doublecomplex *, doublecomplex *, int *); -extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, - doublecomplex*, int*, doublecomplex*, int*); - +extern int ztrsm_(const char*, const char*, const char*, const char*, + const int*, 
const int*, const doublecomplex*, const doublecomplex*, const int*, + doublecomplex*, const int*); +extern void zgemv_(const char *, const int *, const int *, const doublecomplex *, + const doublecomplex *a, const int *, const doublecomplex *, const int *, + const doublecomplex *, doublecomplex *, const int *); #endif -extern int zscal_(int *n, doublecomplex *da, doublecomplex *dx, int *incx); -extern int zaxpy_(int *n, doublecomplex *za, doublecomplex *zx, - int *incx, doublecomplex *zy, int *incy); +extern void zgeru_(const int*, const int*, const doublecomplex*, + const doublecomplex*, const int*, const doublecomplex*, const int*, + doublecomplex*, const int*); + +extern int zscal_(const int *n, const doublecomplex *alpha, doublecomplex *dx, const int *incx); +extern int zaxpy_(const int *n, const doublecomplex *alpha, const doublecomplex *x, + const int *incx, doublecomplex *y, const int *incy); + +/* SuperLU BLAS interface: zsuperlu_blas.c */ +extern int superlu_zgemm(const char *transa, const char *transb, + int m, int n, int k, doublecomplex alpha, doublecomplex *a, + int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc); +extern int superlu_ztrsm(const char *sideRL, const char *uplo, + const char *transa, const char *diag, const int m, const int n, + const doublecomplex alpha, const doublecomplex *a, + const int lda, doublecomplex *b, const int ldb); +extern int superlu_zger(const int m, const int n, const doublecomplex alpha, + const doublecomplex *x, const int incx, const doublecomplex *y, + const int incy, doublecomplex *a, const int lda); +extern int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx); +extern int superlu_zaxpy(const int n, const doublecomplex alpha, + const doublecomplex *x, const int incx, doublecomplex *y, const int incy); +extern int superlu_zgemv(const char *trans, const int m, + const int n, const doublecomplex alpha, const doublecomplex *a, + const int lda, const 
doublecomplex *x, const int incx, + const doublecomplex beta, doublecomplex *y, const int incy); +extern int superlu_ztrsv(char *uplo, char *trans, char *diag, + int n, doublecomplex *a, int lda, doublecomplex *x, int incx); + +#ifdef SLU_HAVE_LAPACK // LAPACK routine extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*); +#endif - -#if 0 /*==== For 3D code ====*/ +extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d); +extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d); + +/* Matrix distributed in NRformat_loc in 3D process grid. It converts + it to a NRformat_loc distributed in 2D grid in grid-0 */ +extern void zGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, doublecomplex *B, + int ldb, int nrhs, gridinfo3d_t *grid3d, + NRformat_loc3d **); +extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d); extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *, zScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs, @@ -630,13 +671,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ extern void zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, - Remain_info_t *Remain_info, doublecomplex *L_mat, int_t ldl, - doublecomplex *U_mat, int_t ldu, doublecomplex *bigV, + Remain_info_t *Remain_info, doublecomplex *L_mat, int ldl, + doublecomplex *U_mat, int ldu, doublecomplex *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t *, SuperLUStat_t * @@ -644,6 +685,8 @@ zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t 
*Ublock_info, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); + +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ extern void zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, @@ -654,7 +697,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t * @@ -662,11 +705,13 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); +#endif + extern int_t zblock_gemm_scatterTopLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -674,21 +719,21 @@ extern int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -721,10 +766,10 @@ extern void 
zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d extern void z3D_printMemUse(trf3Dpartition_t* trf3Dpartition, zLUstruct_t *LUstruct, gridinfo3d_t * grid3d); -extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, - superlu_dist_options_t *options, zLocalLU_t *Llu, - int_t* xsup, int_t num_look_aheads, int_t nsupers, - int_t * iperm_c_supno); +//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, +// superlu_dist_options_t *options, zLocalLU_t *Llu, +// int_t* xsup, int_t num_look_aheads, int_t nsupers, +// int_t * iperm_c_supno); extern void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, sForest_t** sForests, zLUstruct_t* LUstruct, @@ -735,28 +780,6 @@ extern int_t zgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, SCT_t* SCT ); /* The following are from pdgstrf2.h */ -#if 0 // Sherry: same routine names, but different code !!!!!!! -extern void pzgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t, - int_t k, double thresh, Glu_persist_t *, - gridinfo_t *, zLocalLU_t *, MPI_Request *U_diag_blk_send_req, - int tag_ub, SuperLUStat_t *, int *info, SCT_t *); -#ifdef _CRAY -void pzgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -void pzgstrs2_omp (int_t, int_t, int_t, int_t *, doublecomplex*, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, SuperLUStat_t *, Ublock_info_t *, doublecomplex *bigV, int_t ldt, SCT_t *SCT ); -#endif - -#else -extern void pzgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, - double thresh, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, MPI_Request *, int tag_ub, - SuperLUStat_t *, int *info); -extern void pzgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, - zLocalLU_t *, Ublock_info_t *, SuperLUStat_t *); -#endif // same routine names !!!!!!!! 
- extern int_t zLpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, doublecomplex* lusup, int_t nsupr, SCT_t*); extern void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, @@ -771,7 +794,7 @@ extern int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, doublecomplex* uval, doublecomplex *tempv); extern int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, doublecomplex* uval, doublecomplex *tempv, - int_t knsupc, int_t nsupr, doublecomplex* lusup, + int_t knsupc, int nsupr, doublecomplex* lusup, Glu_persist_t *Glu_persist) ; extern void pzgstrs2 #ifdef _CRAY @@ -802,9 +825,9 @@ extern int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruc extern int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ extern int_t zzeroSetLU(int_t nnodes, int_t* nodeList , zLUstruct_t *, gridinfo3d_t*); -extern int AllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *); -extern int DeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*); -extern int DeAllocGlu_3d(zLUstruct_t *); +extern int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *); +extern int zDeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*); +extern int zDeAllocGlu_3d(zLUstruct_t *); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] @@ -815,7 +838,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver, doublecomplex* Lval_buf, doublecomplex* Uval_buf, zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); /*reduces all nodelists required in a level*/ -int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, +extern int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct, @@ -855,21 +878,21 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, /* from communication_aux.h */ extern int_t zIBcast_LPanel (int_t 
k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t **ToSendR, int_t *xsup, int ); + int **ToSendR, int_t *xsup, int ); extern int_t zBcast_LPanel(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, - gridinfo_t *, int* msgcnt, int_t **ToSendR, + gridinfo_t *, int* msgcnt, int **ToSendR, int_t *xsup , SCT_t*, int); extern int_t zIBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t *ToSendD, int ); + int *ToSendD, int ); extern int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, - gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); + gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int); extern int_t zIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, doublecomplex* Lval_buf, gridinfo_t *, MPI_Request *, zLocalLU_t *, int); extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*, zLocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t*); extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t zWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); @@ -942,7 +965,7 @@ extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFac gridinfo_t *, zLUstruct_t *); /* from trfAux.h */ -extern int_t getNsupers(int, zLUstruct_t *); +extern int getNsupers(int, Glu_persist_t *); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); extern int freePackLUInfo(packLUInfo_t* packLUInfo); extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, @@ -953,33 +976,29 @@ extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, doublecomplex* Uval_buf, gridinfo_t *, zLUstruct_t *); extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, int_t*, int_t*, int_t*, gEtreeInfo_t*, - 
factNodelists_t*, scuBufs_t*, + factNodelists_t*, zscuBufs_t*, zLUValSubBuf_t* LUvsb, gridinfo_t *, zLUstruct_t *, HyP_t*); extern doublecomplex* zgetBigV(int_t, int_t); extern doublecomplex* zgetBigU(int_t, gridinfo_t *, zLUstruct_t *); -extern int_t getBigUSize(int_t, gridinfo_t *, zLUstruct_t *); // permutation from superLU default -extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, - zLUstruct_t *, gridinfo_t *); -extern void getSCUweight(int_t nsupers, treeList_t* treeList, zLUstruct_t *, gridinfo3d_t *); /* from treeFactorization.h */ extern int_t zLluBufInit(zLUValSubBuf_t*, zLUstruct_t *); extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t*, zLUstruct_t*, gridinfo_t *); -extern int zfreeScuBufs(scuBufs_t* scuBufs); + zscuBufs_t*, zLUstruct_t*, gridinfo_t *); +extern int zfreeScuBufs(zscuBufs_t* scuBufs); // the generic tree factoring code extern int_t treeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -995,11 +1014,11 @@ extern int_t zsparseTreeFactor( int_t *perm_c_supno, // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo, commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ 
-1014,11 +1033,11 @@ extern int_t zdenseTreeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1032,11 +1051,11 @@ extern int_t zdenseTreeFactor( extern int_t zsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1050,9 +1069,9 @@ extern int_t zsparseTreeFactor_ASYNC( ); extern zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct); extern int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs); -extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); -extern int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs); -extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs); +extern int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf); extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, gridinfo_t *grid, SCT_t *SCT); extern int_t 
checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); @@ -1062,11 +1081,11 @@ extern int_t ancestorFactor( int_t ilvl, // level of factorization sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1078,8 +1097,8 @@ extern int_t ancestorFactor( double thresh, SCT_t *SCT, int tag_ub, int *info ); -/*=====================*/ -#endif // end 3D prototypes +/*== end 3D prototypes ===================*/ + #ifdef __cplusplus } diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h index 1c296530..1d720355 100644 --- a/SRC/supermatrix.h +++ b/SRC/supermatrix.h @@ -188,4 +188,33 @@ typedef struct { } NRformat_loc; +/* Data structure for storing 3D matrix on layer 0 of the 2D process grid + Only grid-0 has meanful values of these data structures. 
*/ +typedef struct NRformat_loc3d +{ + NRformat_loc *A_nfmt; // Gathered A matrix on 2D grid-0 + void *B3d; // on the entire 3D process grid + int ldb; // relative to 3D process grid + int nrhs; + int m_loc; // relative to 3D process grid + void *B2d; // on 2D process layer grid-0 + + int *row_counts_int; // these counts are stored on 2D layer grid-0, + int *row_disp; // but count the number of {A, B} rows along Z-dimension + int *nnz_counts_int; + int *nnz_disp; + int *b_counts_int; + int *b_disp; + + /* The following 4 structures are used for scattering + solution X from 2D grid-0 back to 3D processes */ + int num_procs_to_send; + int *procs_to_send_list; + int *send_count_list; + int num_procs_to_recv; + int *procs_recv_from_list; + int *recv_count_list; +} NRformat_loc3d; + + #endif /* __SUPERLU_SUPERMATRIX */ diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c new file mode 100644 index 00000000..dc6144fa --- /dev/null +++ b/SRC/supernodalForest.c @@ -0,0 +1,971 @@ +/*! @file + * \brief SuperLU utilities + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ +#include +#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include "supernodal_etree.h" +#include "load-balance/supernodalForest.h" +#include "p3dcomm.h" +#endif +#include + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree); + + +sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + // treePartStrat tps; + if (getenv("LBS")) + { + if (strcmp(getenv("LBS"), "ND" ) == 0) + { + return getNestDissForests( maxLvl, nsupers, setree, treeList); + } + if (strcmp(getenv("LBS"), "GD" ) == 0) + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + } + else + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + return 0; +} + +double calcNodeListWeight(int_t nnodes, int_t* nodeList, treeList_t* treeList) +{ + double trWeight = 0; + + for (int i = 0; i < nnodes; ++i) + { + trWeight += treeList[nodeList[i]].weight; + } + + return trWeight; +} + +sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + + int_t numForests = (1 << maxLvl) - 1; + + // allocate space for forests + sForest_t** sForests = SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + + int_t* gTreeHeads = getTreeHeads(maxLvl, nsupers, treeList); + + int_t* gNodeCount = calcNumNodes(maxLvl, gTreeHeads, treeList); + int_t** gNodeLists = getNodeList(maxLvl, setree, gNodeCount, + gTreeHeads, treeList); + + SUPERLU_FREE(gTreeHeads); // Sherry added + + for (int i = 0; i < numForests; ++i) + { + sForests[i] = NULL; + if (gNodeCount[i] > 0) + { + sForests[i] = SUPERLU_MALLOC (sizeof (sForest_t)); + sForests[i]->nNodes = gNodeCount[i]; + sForests[i]->numTrees = 1; + sForests[i]->nodeList = gNodeLists[i]; + sForests[i]->weight = calcNodeListWeight(sForests[i]->nNodes, 
sForests[i]->nodeList, treeList); + + calcTopInfoForest(sForests[i], nsupers, setree); + } + } + + return sForests; +} + +static int_t* sortPtr; + +static int cmpfuncInd (const void * a, const void * b) +{ + return ( sortPtr[*(int_t*)a] - sortPtr[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndex(int_t n, int_t* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtr = A; + + qsort(idx, n, sizeof(int_t), cmpfuncInd); + + return idx; +} + + +static double* sortPtrDouble; + +static int cmpfuncIndDouble (const void * a, const void * b) +{ + return ( sortPtrDouble[*(int_t*)a] > sortPtrDouble[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndexDouble(int_t n, double* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtrDouble = A; + + qsort(idx, n, sizeof(int_t), cmpfuncIndDouble); + + return idx; +} + +static int cmpfunc(const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + + +int_t* permuteArr(int_t n, int_t* A, int_t* perm) +{ + int_t* permA = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + permA[i] = A[perm[i]]; + } + + return permA; +} + + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree) +{ + + int_t nnodes = forest->nNodes; + int_t* nodeList = forest->nodeList; + + qsort(nodeList, nnodes, sizeof(int_t), cmpfunc); + int_t* myIperm = getMyIperm(nnodes, nsupers, nodeList); + int_t* myTopOrderOld = getMyTopOrder(nnodes, nodeList, myIperm, setree ); + int_t* myTopSortIdx = getSortIndex(nnodes, myTopOrderOld); + int_t* nodeListNew = permuteArr(nnodes, nodeList, myTopSortIdx); + int_t* myTopOrder = permuteArr(nnodes, myTopOrderOld, myTopSortIdx); + + SUPERLU_FREE(nodeList); + SUPERLU_FREE(myTopSortIdx); + SUPERLU_FREE(myIperm); + SUPERLU_FREE(myTopOrderOld); + myIperm = getMyIperm(nnodes, nsupers, 
nodeListNew); + + treeTopoInfo_t ttI; + ttI.myIperm = myIperm; + ttI.numLvl = myTopOrder[nnodes - 1] + 1; + ttI.eTreeTopLims = getMyEtLims(nnodes, myTopOrder); + + forest->nodeList = nodeListNew; + forest->topoInfo = ttI; + + SUPERLU_FREE(myTopOrder); // sherry added + + return 0; +} + +// #pragma optimize ("", off) + +double* getTreeWeights(int_t numTrees, int_t* gNodeCount, int_t** gNodeLists, treeList_t* treeList) +{ + double* gTreeWeights = DOUBLE_ALLOC(numTrees); + + // initialize with weight with whole subtree weights + for (int_t i = 0; i < numTrees; ++i) + { + gTreeWeights[i] = calcNodeListWeight(gNodeCount[i], gNodeLists[i], treeList); + } + + return gTreeWeights; + +} + +int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeCount = INT_T_ALLOC (numForests); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + {gNodeCount[i] = sForests[i]->nNodes;} + else + { + gNodeCount[i] = 0; + } + } + return gNodeCount; +} + +int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*)); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { + gNodeLists[i] = sForests[i]->nodeList; + } + else + { + gNodeLists[i] = NULL; + } + } + + return gNodeLists; +} + +int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeToForstMap = INT_T_ALLOC (nsupers); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { int_t nnodes = sForests[i]->nNodes; + int_t* nodeList = sForests[i]->nodeList; + for(int_t node = 0; nodenNodes; + } + + return myNodeCount; +} + + +int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + 
int_t** treePerm = (int_t** ) SUPERLU_MALLOC(sizeof(int_t*)*maxLvl); + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + treePerm[lvl] = NULL; + if (sForests[myTreeIdxs[lvl]]) + treePerm[lvl] = sForests[myTreeIdxs[lvl]]->nodeList; + } + return treePerm; +} + +int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm) +{ + int* isNodeInMyGrid = SUPERLU_MALLOC(nsupers * sizeof(int)); + + for(int i=0; igrid2d); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + double* gFrstCost = DOUBLE_ALLOC(numForests); + double* gFrstCostAcc = DOUBLE_ALLOC(numForests); + double* gFrstWt = DOUBLE_ALLOC(numForests); + + for (int i = 0; i < numForests; ++i) + { + gFrstCost[i] = 0; + gFrstWt[i] = 0; + if (sForests[i]) + { + gFrstCost[i] = sForests[i]->cost; + gFrstWt[i] = sForests[i]->weight; + } + } + + // reduce forest costs from all the grid; + MPI_Reduce(gFrstCost, gFrstCostAcc, numForests, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm); + + if (!grid3d->zscp.Iam && !grid->iam) + { + printf("|Forest | weight | cost | weight/Cost | \n"); + for (int i = 0; i < numForests; ++i) + { + /* code */ + double wt, ct; + wt = 0.0; + ct = 0.0; + if (sForests[i]) + { + wt = sForests[i]->weight; + } + printf("|%d | %.2e | %.2e | %.2e |\n", i, wt, gFrstCostAcc[i], 1e-9 * wt / gFrstCostAcc[i] ); + + } + + double* crPathCost = DOUBLE_ALLOC(numForests); + double* crPathWeight = DOUBLE_ALLOC(numForests); + // print the critcal path + for (int i = numForests - 1; i > -1 ; --i) + { + crPathCost[i] = gFrstCostAcc[i]; + crPathWeight[i] = gFrstWt[i]; + + if (2 * i + 1 < numForests) + { + + if (crPathCost[2 * i + 1] > crPathCost[2 * i + 2]) + { + /* code */ + crPathCost[i] += crPathCost[2 * i + 1]; + crPathWeight[i] += crPathWeight[2 * i + 1]; + } + else + { + crPathCost[i] += crPathCost[2 * i + 2]; + crPathWeight[i] += crPathWeight[2 * i + 2]; + } + } + } + + + printf("|CritcalPath | %.2e | %.2e | %.2e |\n", crPathWeight[0], crPathCost[0], 1e-9 
* crPathWeight[0] / crPathCost[0] ); + + double prsnCoeff = pearsonCoeff(numForests, gFrstCost, gFrstWt); + printf("|Pearsoncoefficient | %.3f |\n", prsnCoeff); + + printf("\n~~~mermaid \n"); + printf("\ngantt \n \ + \t\t dateFormat mm-ss \n\ + \t\t title TreeCost and Time Gantt Chart\n\n\n" ); + printf("\t Section Time\n"); + printGantt(0, numForests, "Time", 1.0 , gFrstCostAcc, crPathCost); + printf("\t Section Weight\n"); + printGantt(0, numForests, "weight", crPathCost[0]/crPathWeight[0] , gFrstWt, crPathWeight); + + printf("~~~\n\n\n"); + SUPERLU_FREE(crPathCost); + SUPERLU_FREE(crPathWeight); + + } + + SUPERLU_FREE( gFrstCost); + SUPERLU_FREE( gFrstCostAcc); + SUPERLU_FREE( gFrstWt); +} + + +void printGantt(int root, int numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost) +{ + + + if (2*root+1>=numForests) + { + /* if there are no more childrens*/ + printf("\t tree-%d \t:%s-%d, 0d, %.0fd \n", root,nodename, root, 100*scale*gFrstCostAcc[root] ); + } + else + { + printGantt(2*root+1, numForests, nodename, scale, gFrstCostAcc, crPathCost); + int depTree =crPathCost[2*root+1]> crPathCost[2*root+2]? 
2*root+1:2*root+2; + printf("\t tree-%d %.2g \t:%s-%d, after %s-%d, %.0fd \n", root,100*scale*crPathCost[root], nodename, root, nodename, depTree, 100*scale*gFrstCostAcc[root] ); + printGantt(2*root+2, numForests, nodename, scale, gFrstCostAcc, crPathCost); + } +} + +#define ABS(a) ((a)<0?-(a):a) +double getLoadImbalance(int_t nTrees, + int_t * treeIndx, // index of tree in gtrees + double * gTreeWeights) +{ + + if (nTrees < 1) + { + /* code */ + return 0; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, gTreeWeights); + // can not change weight array + w1 = gTreeWeights[wSortIdx[nTrees - 1]]; + + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += gTreeWeights[wSortIdx[i]]; + + } + else + { + w1 += gTreeWeights[wSortIdx[i]]; + + } + } + + SUPERLU_FREE(wSortIdx); + return ABS(w2 - w1) / (w2 + w1); + // return trPart; + +} + + +// maximum allowed imbalance +#define ACCEPTABLE_TREE_IMBALANCE 0.2 + + +// r forest contains a list of tree heads +// each treehead is an entire subtree (all level beloe) +#define MAX_TREE_ALLOWED 1024 + +typedef struct +{ + int_t ntrees; + int_t* treeHeads; +} rForest_t; + +typedef struct +{ + sForest_t* Ans; + rForest_t* S[2]; +} forestPartition_t; + +void freeRforest(rForest_t* rforest) +{ + SUPERLU_FREE(rforest->treeHeads); +} + + +sForest_t* createForestNew(int_t numTrees, int_t nsupers, int_t * nodeCounts, int_t** NodeLists, int_t * setree, treeList_t* treeList) +{ + if (numTrees == 0) return NULL; + + sForest_t* forest = SUPERLU_MALLOC(sizeof(sForest_t)); + forest->numTrees = numTrees; + + double frWeight = 0; + int_t nodecount = 0; + for (int_t i = 0; i < numTrees; ++i) + { + nodecount += nodeCounts[i]; + frWeight += calcNodeListWeight(nodeCounts[i], NodeLists[i], treeList); + } + + forest->nNodes = nodecount; + forest->weight = frWeight; + + int_t* nodeList = INT_T_ALLOC(forest->nNodes); + + int_t ptr = 0; + for (int_t i = 0; i < numTrees; ++i) + { + 
for (int_t j = 0; j < nodeCounts[i]; ++j) + { + /* copy the loop */ + nodeList[ptr] = NodeLists[i][j]; + ptr++; + } + } + + forest->nodeList = nodeList; + forest->cost = 0.0; + + + // using the nodelist create factorization ordering + calcTopInfoForest(forest, nsupers, setree); + + return forest; +} + +void oneLeveltreeFrPartition( int_t nTrees, int_t * trCount, int_t** trList, + int_t * treeSet, + double * sWeightArr) +{ + if (nTrees < 1) + { + /* code */ + trCount[0] = 0; + trCount[1] = 0; + return; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, sWeightArr); + // treeIndx= permuteArr(nTrees, treeIndx, wSortIdx); + + int_t S1ptr = 0; + int_t S2ptr = 0; + + // can not change weight array + w1 = sWeightArr[wSortIdx[nTrees - 1]]; + trList[0][S1ptr++] = treeSet[wSortIdx[nTrees - 1]]; + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += sWeightArr[wSortIdx[i]]; + trList[1][S2ptr++] = treeSet[wSortIdx[i]]; + } + else + { + w1 += sWeightArr[wSortIdx[i]]; + trList[0][S1ptr++] = treeSet[wSortIdx[i]]; + } + } + + trCount[0] = S1ptr; + trCount[1] = S2ptr; + + SUPERLU_FREE(wSortIdx); + +} /* oneLeveltreeFrPartition */ + +forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + int_t nTreeSet = rforest->ntrees; + int_t* treeHeads = rforest->treeHeads; + + int_t nAnc = 0; +#if 0 + int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = DOUBLE_ALLOC (MAX_TREE_ALLOWED); + // int_t* treeSet = INT_T_ALLOC(nTreeSet); + int_t* treeSet = INT_T_ALLOC(MAX_TREE_ALLOWED); +#else // Sherry fix + int_t* ancTreeCount = intMalloc_dist(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = doubleMalloc_dist(MAX_TREE_ALLOWED); + int_t* treeSet = intMalloc_dist(MAX_TREE_ALLOWED); 
+#endif + + for (int i = 0; i < nTreeSet; ++i) + { + treeSet[i] = treeHeads[i]; + weightArr[i] = treeList[treeHeads[i]].iWeight; + } + + while (getLoadImbalance(nTreeSet, treeSet, weightArr) > ACCEPTABLE_TREE_IMBALANCE ) + { + // get index of maximum weight subtree + int_t idx = 0; + for (int i = 0; i < nTreeSet; ++i) + { + /* code */ + if (treeList[treeSet[i]].iWeight > treeList[treeSet[idx]].iWeight) + { + /* code */ + idx = i; + } + } + + + int_t MaxTree = treeSet[idx]; + int_t* sroots = getSubTreeRoots(MaxTree, treeList); + if (sroots[0] == -1) + { + /* code */ + SUPERLU_FREE(sroots); + break; + } + + ancTreeCount[nAnc] = getCommonAncsCount(MaxTree, treeList); + //int_t * alist = INT_T_ALLOC (ancTreeCount[nAnc]); + int_t * alist = intMalloc_dist(ancTreeCount[nAnc]); + getCommonAncestorList(MaxTree, alist, setree, treeList); + ancNodeLists[nAnc] = alist; + nAnc++; + + + treeSet[idx] = treeSet[nTreeSet - 1]; + weightArr[idx] = treeList[treeSet[idx]].iWeight; + treeSet[nTreeSet - 1] = sroots[0]; + weightArr[nTreeSet - 1] = treeList[treeSet[nTreeSet - 1]].iWeight; + treeSet[nTreeSet] = sroots[1]; + weightArr[nTreeSet] = treeList[treeSet[nTreeSet]].iWeight; + nTreeSet += 1; + + SUPERLU_FREE(sroots); + + if (nTreeSet == MAX_TREE_ALLOWED) + { + break; + } + } + + // Create the Ancestor forest + sForest_t* aforest = createForestNew(nAnc, nsupers, ancTreeCount, ancNodeLists, setree, treeList); + + // create the weight array; + //double* sWeightArr = DOUBLE_ALLOC(nTreeSet); + double* sWeightArr = doubleMalloc_dist(nTreeSet); // Sherry fix + for (int i = 0; i < nTreeSet ; ++i) + sWeightArr[i] = treeList[treeSet[i]].iWeight; + + int_t trCount[2] = {0, 0}; + int_t* trList[2]; +#if 0 + trList[0] = INT_T_ALLOC(nTreeSet); + trList[1] = INT_T_ALLOC(nTreeSet); +#else // Sherry fix + trList[0] = intMalloc_dist(nTreeSet); + trList[1] = intMalloc_dist(nTreeSet); +#endif + + oneLeveltreeFrPartition( nTreeSet, trCount, trList, + treeSet, + sWeightArr); + + rForest_t *rforestS1, 
*rforestS2; +#if 0 + rforestS1 = SUPERLU_MALLOC(sizeof(rforest)); + rforestS2 = SUPERLU_MALLOC(sizeof(rforest)); +#else + rforestS1 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); // Sherry fix + rforestS2 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); +#endif + + rforestS1->ntrees = trCount[0]; + rforestS1->treeHeads = trList[0]; + + rforestS2->ntrees = trCount[1]; + rforestS2->treeHeads = trList[1]; + + forestPartition_t frPr_t; + frPr_t.Ans = aforest; + frPr_t.S[0] = rforestS1; + frPr_t.S[1] = rforestS2; + + SUPERLU_FREE(weightArr); + SUPERLU_FREE(treeSet); + SUPERLU_FREE(sWeightArr); + + // free stuff + // int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + // int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + for (int i = 0; i < nAnc ; ++i) + { + /* code */ + SUPERLU_FREE(ancNodeLists[i]); + } + + SUPERLU_FREE(ancTreeCount); + SUPERLU_FREE(ancNodeLists); + + return frPr_t; +} /* iterativeFrPartitioning */ + + +/* Create a single sforest */ +sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + int_t nTree = rforest->ntrees; + + // quick return + if (nTree < 1) return NULL; + + int_t* treeHeads = rforest->treeHeads; + int_t* nodeCounts = INT_T_ALLOC(nTree); + int_t** NodeLists = SUPERLU_MALLOC(nTree * sizeof(int_t*)); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + nodeCounts[i] = treeList[treeHeads[i]].numDescendents; + NodeLists[i] = INT_T_ALLOC(nodeCounts[i]); + getDescendList(treeHeads[i], NodeLists[i], treeList); + } + + + sForest_t* sforest = createForestNew(nTree, nsupers, nodeCounts, NodeLists, setree, treeList); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + SUPERLU_FREE(NodeLists[i]); + } + + SUPERLU_FREE(NodeLists); + SUPERLU_FREE(nodeCounts); + + return sforest; +} /* r2sForest */ + + +sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 
1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) nRootTrees++; + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + + } + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + + freeRforest(&rForests[0]); // sherry added + SUPERLU_FREE(rForests); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + freeRforest(frPr_t.S[0]); // Sherry added + freeRforest(frPr_t.S[1]); +#if 0 + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); +#endif + } else { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + + } + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); // Sherry added + } + + SUPERLU_FREE(rForests); // Sherry added + + return sForests; + +} /* getGreedyLoadBalForests */ + +// balanced 
forests at one level +sForest_t** getOneLevelBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + nRootTrees += 2; + } + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + } + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) + { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + } + else + { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + } + + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); + } + + SUPERLU_FREE(rForests); + + + + return sForests; + +} diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c new file mode 100644 index 00000000..c683feaf --- /dev/null +++ b/SRC/supernodal_etree.c @@ -0,0 +1,1060 @@ +/*! 
@file + * \brief function to generate supernodal etree + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include +#include +#include "superlu_ddefs.h" +//#include "supernodal_etree.h" + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +int_t log2i(int_t index) +{ + int_t targetlevel = 0; + while (index >>= 1) ++targetlevel; + return targetlevel; +} + +/** + * Returns Supernodal Elimination Tree + * @param nsuper Number of Supernodes + * @param etree Scalar elimination tree + * @param supno Vertex to supernode mapping + * @param xsup Supernodal boundaries + * @return Supernodal elimination tree + */ +int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup) +{ + // int_t *setree = malloc(sizeof(int_t) * nsuper); + int_t *setree = intMalloc_dist(nsuper); // Sherry fix + /*initialzing the loop*/ + for (int i = 0; i < nsuper; ++i) + { + setree[i] = nsuper; + } + /*calculating the setree*/ + for (int i = 0; i < nsuper - 1; ++i) + { + int_t ftree = etree[xsup[i + 1] - 1]; + if (ftree < xsup[nsuper]) + { + setree[i] = supno[etree[xsup[i + 1] - 1]]; + } + } + return setree; +} +/*takes supernodal elimination tree and for each +supernode calculates "level" in elimination tree*/ +int_t* topological_ordering(int_t nsuper, int_t* setree) +{ + // int_t *tsort_setree = malloc(sizeof(int_t) * nsuper); + int_t *tsort_setree = intMalloc_dist(nsuper); // Sherry fix + for (int i = 0; i < nsuper; ++i) + { + tsort_setree[i] = 0; /*initializing all levels to zero*/ + } + for (int i = 0; i < nsuper - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + tsort_setree[setree[i]] = SUPERLU_MAX(tsort_setree[setree[i]], tsort_setree[i] + 1); + } + return tsort_setree; +} + + +treeList_t* setree2list(int_t nsuper, int_t* setree ) +{ + treeList_t* treeList = (treeList_t* ) SUPERLU_MALLOC (sizeof(treeList_t) * (nsuper + 1)); + + // initialize the struct + for (int i = 0; i < nsuper + 1; ++i) + { + treeList[i].numChild = 0; + treeList[i].numDescendents = 1; /*numdescen includes myself*/ + treeList[i].left = -1; + treeList[i].right = 
-1; + treeList[i].right = -1; + treeList[i].depth = 0; + } + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].numDescendents += treeList[i].numDescendents; + treeList[parenti].numChild++; + } + + /*allocate memory for children lists*/ + for (int i = 0; i < nsuper + 1; ++i) + { + //treeList[i].childrenList = INT_T_ALLOC (treeList[i].numChild); + treeList[i].childrenList = intMalloc_dist(treeList[i].numChild); + treeList[i].numChild = 0; + } + + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].childrenList[treeList[parenti].numChild] = i; + treeList[parenti].numChild++; + } + + return treeList; + +} /* setree2list */ + +// Sherry added +int free_treelist(int_t nsuper, treeList_t* treeList) +{ + for (int i = 0; i < nsuper + 1; ++i) { + SUPERLU_FREE(treeList[i].childrenList); + } + SUPERLU_FREE(treeList); + return 0; +} + +int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + if (getenv("WF")) + { + if (strcmp(getenv("WF"), "One" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = 1.0; + } + } + else if (strcmp(getenv("WF"), "Ns" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz; + } + } + else if (strcmp(getenv("WF"), "NsDep" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz * dep; + } + } + else if (strcmp(getenv("WF"), "NsDep2" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + + } + + } + else + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = treeList[i].scuWeight; + } + } + } + else + { + for (int i = 0; i < nsupers; ++i) + { + 
treeList[i].weight = treeList[i].scuWeight; + + } + } + + return 0; +} /* estimateWeight */ + + +int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + + // initializing naive weight + for (int i = 0; i < nsupers; ++i) + { + treeList[i].depth = 0; + } + + for (int i = nsupers - 1; i > -1; --i) + { + /* code */ + int_t myDep = treeList[i].depth; + for (int cIdx = 0; cIdx < treeList[i].numChild; ++cIdx) + { + /* code */ + int_t child = treeList[i].childrenList[cIdx]; + treeList[child].depth = myDep + SuperSize(i) ; + + } + } + + + // for (int i = 0; i < nsupers; ++i) + // { + + // // treeList[i].weight = 1.0 * treeList[i].numDescendents; + // double dep = 1.0 * treeList[i].depth ; + // double sz = 1.0 * SuperSize(i); + // treeList[i].weight = 1.0; + // treeList[i].weight = sz; + // treeList[i].weight = sz * sz * sz; + // treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + // treeList[i].weight = treeList[i].scuWeight; + // // treeList[i].treeWeight = treeList[i].weight; + // // treeList[i].depth = 0; + // } + + estimateWeight(nsupers, setree, treeList, xsup); + + for (int i = 0; i < nsupers; ++i) + { + treeList[i].iWeight = treeList[i].weight; + } + + + for (int i = 0; i < nsupers; ++i) + { + int_t parenti = setree[i]; + treeList[parenti].iWeight += treeList[i].iWeight; + } + + + return 0; + +} /* calcTreeWeight */ + + +int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree) +{ + FILE* fp = fopen(sname, "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nnodes; ++i) + { + /* code */ + fprintf(fp, IFMT " -> " IFMT ";\n", dlist[i], setree[dlist[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); + return 0; +} + +int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList) +// post order traversal +{ + if (k < 0) return 0; + + int_t cDesc = 0; + + for 
(int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + int_t nChild = treeList[k].childrenList[child]; + cDesc += getDescendList(nChild, dlist + cDesc, treeList); + } + + dlist[cDesc] = k; + return cDesc + 1; +} + + +int_t getCommonAncsCount(int_t k, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + return count; +} +int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + + + alist[0] = cur; + for (int i = 1; i < count; ++i) + { + /* code */ + alist[i] = seTree[cur]; + cur = seTree[cur]; + } + return count; +} + +int cmpfunc (const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + +int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup, int_t* iperm_c_sup) +//from list of nodes, get permutation of factorization +{ + int_t* perm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + int_t* iperm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + // printf("%d %d %d\n",i, nlist[i],iperm_c_sup[nlist[i]] ); + iperm_l[i] = iperm_c_sup[nlist[i]]; //order of factorization + } + qsort(iperm_l, nnode, sizeof(int_t), cmpfunc); + + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + perm_l[i] = perm_c_sup[iperm_l[i]]; //order of factorization + } + SUPERLU_FREE(iperm_l); + return perm_l; +} +int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder) +// calculates EtreeLB boundaries for given list of nodes, via perm_l +{ + //calculate minimum and maximum topOrder + int minTop, maxTop; + minTop = gTopOrder[perm_l[0]]; + maxTop = 
gTopOrder[perm_l[nnodes - 1]]; + int numLB = maxTop - minTop + 2; + //int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB); + int_t* lEtreeLB = (int_t *) intMalloc_dist(numLB); // Sherry fix + for (int i = 0; i < numLB; ++i) + { + /* initalize */ + lEtreeLB[i] = 0; + } + lEtreeLB[0] = 0; + int curLevel = minTop; + int curPtr = 1; + for (int i = 0; i < nnodes ; ++i) + { + /* code */ + if (curLevel != gTopOrder[perm_l[i]]) + { + /* creset */ + curLevel = gTopOrder[perm_l[i]]; + lEtreeLB[curPtr] = i; + curPtr++; + } + } + lEtreeLB[curPtr] = lEtreeLB[curPtr - 1] + 1; + printf("numLB=%d curPtr=%d \n", numLB, curPtr); + for (int i = 0; i < numLB; ++i) + { + printf(IFMT, lEtreeLB[i]); + } + + return lEtreeLB; +} + +int_t* getSubTreeRoots(int_t k, treeList_t* treeList) +{ + int_t* srootList = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * 2); + int_t cur = k; + while (treeList[cur].numChild == 1 && cur > 0) + { + cur = treeList[cur].childrenList[0]; + } + + if (treeList[cur].numChild == 2) + { + /* code */ + srootList[0] = treeList[cur].childrenList[0]; + srootList[1] = treeList[cur].childrenList[1]; + // printf("Last node =%d, numchilds=%d, desc[%d] = %d, desc[%d] = %d \n ", + // cur, treeList[cur].numChild, + // srootList[0], treeList[srootList[0]].numDescendents, + // srootList[1], treeList[srootList[1]].numDescendents ); + } + else + { + /* code */ + srootList[0] = -1; + srootList[1] = -1; + } + + return srootList; +} + +int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount) +// tests disjoint and union +{ + //int_t* slist = (int_t* ) malloc(sizeof(int_t) * nsupers); + int_t* slist = intMalloc_dist(nsupers); // Sherry fix + /*intialize each entry with zero */ + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + slist[i] = 0; + } + for (int_t list = 0; list < numList; ++list) + { + /* code */ + for (int_t nd = 0; nd < nodeCount[list]; ++nd) + { + slist[nodeList[list][nd]]++; + } + } + + for (int_t i = 0; i < nsupers; ++i) + { + 
/* code */ + assert(slist[i] == 1); + } + printf("testSubtreeNodelist Passed\n"); + SUPERLU_FREE(slist); + return 0; +} +int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel) +{ + // checking monotonicity + for (int i = 0; i < nodeCount - 1; ++i) + { + if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]])) + { + /* code */ + printf("%d :" IFMT "(" IFMT ")" IFMT "(" IFMT ")\n", i, + permList[i], gTopLevel[permList[i]], + permList[i + 1], gTopLevel[permList[i + 1]] ); + } + assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); + } +#if 0 + int_t* slist = (int_t* ) malloc(sizeof(int_t) * nodeCount); + int_t* plist = (int_t* ) malloc(sizeof(int_t) * nodeCount); +#else + int_t* slist = intMalloc_dist(nodeCount); + int_t* plist = intMalloc_dist(nodeCount); +#endif + // copy lists + for (int_t i = 0; i < nodeCount; ++i) + { + slist[i] = nodeList[i]; + plist[i] = permList[i]; + } + // sort them + qsort(slist, nodeCount, sizeof(int_t), cmpfunc); + qsort(plist, nodeCount, sizeof(int_t), cmpfunc); + for (int_t i = 0; i < nodeCount; ++i) + { + assert( slist[i] == plist[i]); + } + printf("permList Test Passed\n"); + + SUPERLU_FREE(slist); + SUPERLU_FREE(plist); + + return 0; +} + + +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes); + +// Sherry: the following routine is not called ?? 
+int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + + // alloc address + //int_t* gperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* gperm = intMalloc_dist(nn); // Sherry fix + + //now concatenat arrays + int ptr = 0; + for (int tr = 0; tr < nperms; ++tr) + { + /* code */ + for (int nd = 0; nd < nnodes[tr]; ++nd) + { + /* code */ + gperm[ptr] = perms[tr][nd]; + printf("%d %d %d" IFMT "\n", tr, ptr, nd, perms[tr][nd] ); + ptr++; + } + } + mergPermTest( nperms, gperm, nnodes); + return gperm; +} /* merg_perms */ + +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + + // alloc address + // int_t* tperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* tperm = intMalloc_dist(nn); // Sherry fix + + for (int i = 0; i < nn; ++i) + { + tperm[i] = 0; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + printf("%d" IFMT "\n", i, gperms[i] ); + tperm[gperms[i]]++; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + assert(tperm[i] == 1); + } + SUPERLU_FREE(tperm); + return nn; +} /* mergPermTest */ + +#if 0 // Sherry: not called anymore +int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, + superlu_dist_options_t *options, + LocalLU_t *Llu, int_t* xsup, + int_t num_look_aheads, int_t nsupers, int_t * iperm_c_supno) +{ + /* constructing look-ahead table to indicate the last dependency */ + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t ncb = nsupers / Pc; + int_t nrb = nsupers / Pr; + stat->num_look_aheads = num_look_aheads; + int* look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int_t lb = 0; lb < 
nsupers; lb++) + look_ahead_l[lb] = -1; + /* go through U-factor */ + for (int_t lb = 0; lb < nrb; ++lb) + { + int_t ib = lb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[lb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) + { + int_t ib = nrb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[nrb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (options->SymPattern == NO) + { + /* go through L-factor */ + for (int_t lb = 0; lb < ncb; lb++) + { + int_t ib = lb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[lb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) + { + int_t ib = ncb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[ncb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, + grid->comm); + SUPERLU_FREE (look_ahead_l); + return look_ahead; +} + +int* getLastDepBtree( int_t nsupers, treeList_t* treeList) +{ + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int i = 0; i < nsupers; ++i) + { + look_ahead[i] = -1; + } + for (int k = 0; k < nsupers; 
++k) + { + /* code */ + for (int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + switch ( child) + { + case 0: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].left); + break; + case 1: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].right); + break; + case 2: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].extra); + break; + default: + break; + } + } + } + return look_ahead; +} + +#endif // Sherry: not called anymore + + +int_t* getGlobal_iperm(int_t nsupers, int_t nperms, // number of permutations + int_t** perms, // array of permutations + int_t* nnodes // number of nodes in each permutation + ) +{ + int_t* gperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t* giperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t ptr = 0; + for (int_t perm = 0; perm < nperms; ++perm) + { + /* code */ + for (int_t node = 0; node < nnodes[perm]; ++node) + { + /* code */ + gperm[ptr] = perms[perm][node]; + ptr++; + } + } + assert(ptr == nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + giperm[gperm[i]] = i; + } + SUPERLU_FREE(gperm); + return giperm; +} +int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* treeHeads = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + // for (int i = 0; i < numTrees; ++i) + // { + // /* code */ + // treeHeads[i]=0; + // } + treeHeads[0] = nsupers - 1; + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + int_t * sroots; + sroots = getSubTreeRoots(treeHeads[i], treeList); + treeHeads[2 * i + 1] = sroots[0]; + treeHeads[2 * i + 2] = sroots[1]; + SUPERLU_FREE(sroots); + } + } + return treeHeads; +} + +int_t* calcNumNodes(int_t maxLvl, int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* nnodes = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + for (int_t i = 0; i < numTrees; 
++i) + { + /* code */ + if (treeHeads[i] > -1) + { + /* code */ + nnodes[i] = treeList[treeHeads[i]].numDescendents; + } + else + { + nnodes[i] = 0; + } + + } + for (int_t i = 0; i < numTrees / 2 ; ++i) + { + /* code */ + nnodes[i] -= (nnodes[2 * i + 1] + nnodes[2 * i + 2]); + } + return nnodes; +} + +int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t** nodeList = SUPERLU_MALLOC (numTrees * sizeof (int_t*)); + for (int_t i = 0; i < numTrees; ++i) + { + /* code */ + if (nnodes[i] > 0) + { + nodeList[i] = SUPERLU_MALLOC (nnodes[i] * sizeof (int_t)); + assert(nodeList[i]); + } + else + { + nodeList[i] = NULL; + } + + } + + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + if (nodeList[i]) + getCommonAncestorList(treeHeads[i], nodeList[i], setree, treeList); + } + } + + int_t st = (1 << (maxLvl - 1)) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + getDescendList(treeHeads[i], nodeList[i], treeList); + } + return nodeList; +} + +int_t* getGridTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myTreeIdx = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + myTreeIdx[0] = grid3d->zscp.Np - 1 + grid3d->zscp.Iam ; + for (int i = 1; i < maxLvl; ++i) + { + /* code */ + myTreeIdx[i] = (myTreeIdx[i - 1] - 1) / 2; + } + return myTreeIdx; +} + +int_t* getReplicatedTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myZeroTrIdxs = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + for (int i = 0; i < maxLvl; ++i) + { + /* code */ + if (grid3d->zscp.Iam % (1 << i) ) + { + myZeroTrIdxs[i] = 1; + } + else + { + myZeroTrIdxs[i] = 0; + } + } + return myZeroTrIdxs; +} + + +int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm) +{ + if (nnodes < 0) 
return NULL; + int_t* myIperm = INT_T_ALLOC(nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + myIperm[i] = -1; + } + for (int_t i = 0; i < nnodes; ++i) + { + /* code */ + assert(myPerm[i] < nsupers); + myIperm[myPerm[i]] = i; + } + return myIperm; +} +int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ) +{ + if (nnodes < 0) return NULL; + int_t* myTopOrder = INT_T_ALLOC(nnodes); + for (int_t i = 0; i < nnodes; ++i) + { + myTopOrder[i] = 0; /*initializing all levels to zero*/ + } + for (int_t i = 0; i < nnodes - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + int_t inode = myPerm[i]; + int_t iparent = setree[inode]; + int_t iparentIdx = myIperm[iparent]; + // if(iparentIdx >= nnodes) printf("%d %d %d %d \n", inode, iparent, nnodes, iparentIdx); + // assert(iparentIdx < nnodes); + // if (iparentIdx != -1) + if (0<= iparentIdx && iparentIdx 0) + { + /* code */ + st = Etree_LvlBdry[i]; + } + for (int_t j = st; j < nsuper; ++j) + { + /* code */ + if (tsort_etree[perm[j]] == i + 1) + { + /* code */ + Etree_LvlBdry[i + 1] = j; + break; + } + } + } + Etree_LvlBdry[max_level] = nsuper; + return Etree_LvlBdry; +} + +int_t* calculate_num_children(int_t nsuper, int_t* setree) +{ + //int_t* etree_num_children = malloc(sizeof(int_t) * (nsuper)); + int_t* etree_num_children = intMalloc_dist(nsuper); // Sherry fix + for (int_t i = 0; i < nsuper; ++i) + { + /*initialize num children to zero*/ + etree_num_children[i] = 0; + } + for (int_t i = 0; i < nsuper; i++) + { + if (setree[i] < nsuper) + etree_num_children[setree[i]]++; + } + return etree_num_children; +} +void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper) +{ + for (int i = 0; i < max_level; ++i) + { + int st = 0; + int ed = nsuper; + st = Etree_LvlBdry[i]; + ed = Etree_LvlBdry[i + 1]; + printf("Level %d, NumSuperNodes=%d,\t Start=%d end=%d\n", i, ed - st, st, ed); + } +} + +void print_etree_leveled(int_t *setree, int_t* tsort_etree, 
int_t nsuper) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + int max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + fprintf(fp, "labelloc=\"t\";\n"); + fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, "%d -> " IFMT ";\n", i, setree[i]); + } + /*adding rank information*/ + for (int i = 0; i < max_level; ++i) + { + fprintf(fp, "{ rank=same; "); + for (int j = 0; j < nsuper; ++j) + { + if (tsort_etree[j] == i) + fprintf(fp, "%d ", j); + } + fprintf(fp, "}\n"); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void printEtree(int_t nsuper, int_t *setree, treeList_t* treeList) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + // int_t max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + // fprintf(fp, "labelloc=\"t\";\n"); + // fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, " \"%d|%ld\" -> \"%ld|%ld\";\n", i, treeList[i].depth, + (long int) setree[i], (long int) treeList[setree[i]].depth); + } + + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void print_etree(int_t *setree, int_t* iperm, int_t nsuper) +{ + FILE* fp = fopen("output.dot", "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nsuper; ++i) + { + /* code */ + fprintf(fp, IFMT " -> " IFMT ";\n", iperm[i], iperm[setree[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, 
"//EOF\n"); + fclose(fp); +} diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c new file mode 100644 index 00000000..4a27e531 --- /dev/null +++ b/SRC/sutil_dist.c @@ -0,0 +1,977 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Several matrix utilities + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * October 5, 2021
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+void
+sCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz,
+			    float *nzval, int_t *rowind, int_t *colptr,
+			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{   /* Fill header A as an m-by-n column-compressed (NC) matrix; nzval/rowind/colptr are adopted by reference, not copied. */
+    NCformat *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) ); /* only the Store descriptor itself is allocated here */
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NCformat *) A->Store;
+    Astore->nnz = nnz;
+    Astore->nzval = nzval;
+    Astore->rowind = rowind;
+    Astore->colptr = colptr;
+}
+
+void
+sCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
+			       int_t nnz_loc, int_t m_loc, int_t fst_row,
+			       float *nzval, int_t *colind, int_t *rowptr,
+			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{   /* Fill header A as a distributed row-compressed (NRformat_loc) matrix: this process owns m_loc rows starting at global row fst_row, with nnz_loc local nonzeros. Arrays are adopted by reference. */
+    NRformat_loc *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); /* only the Store descriptor itself is allocated here */
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NRformat_loc *) A->Store;
+    Astore->nnz_loc = nnz_loc;
+    Astore->fst_row = fst_row;
+    Astore->m_loc = m_loc;
+    Astore->nzval = nzval;
+    Astore->colind = colind;
+    Astore->rowptr = rowptr;
+}
+
+/*! \brief Convert a row compressed storage into a column compressed storage.
+ */
+void
+sCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz,
+                         float *a, int_t *colind, int_t *rowptr,
+                         float **at, int_t **rowind, int_t **colptr)
+{   /* Build freshly-allocated CSC arrays (*at, *rowind, *colptr) from the CSR input (a, colind, rowptr); caller frees the three outputs. */
+    register int i, j, col, relpos;
+    int_t *marker;
+
+    /* Allocate storage for another copy of the matrix. */
+    *at = (float *) floatMalloc_dist(nnz);
+    *rowind = intMalloc_dist(nnz);
+    *colptr = intMalloc_dist(n+1);
+    marker = intCalloc_dist(n); /* marker[c] = running insertion cursor for column c */
+
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+	(*colptr)[j+1] = (*colptr)[j] + marker[j]; /* prefix-sum of per-column counts */
+	marker[j] = (*colptr)[j]; /* reset marker to the column's start for the fill pass */
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    col = colind[j];
+	    relpos = marker[col];
+	    (*rowind)[relpos] = i;
+	    (*at)[relpos] = a[j];
+	    ++marker[col]; /* advance this column's cursor */
+	}
+    }
+
+    SUPERLU_FREE(marker);
+}
+
+/*! \brief Copy matrix A into matrix B. */
+void
+sCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{   /* Deep-copy the NC (column-compressed) contents of A into B; B->Store and its arrays must already be allocated large enough. */
+    NCformat *Astore, *Bstore;
+    int      ncol, nnz, i; /* NOTE(review): plain int may truncate a 64-bit int_t nnz on very large problems -- confirm */
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;  /* fixed: removed stray ';;' (empty statement) */
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+	((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void sPrint_CompCol_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of a column-compressed (NC) matrix to stdout: header fields, then nzval, rowind, and colptr arrays. */
+    NCformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nCompCol matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NCformat *) A->Store;
+    printf("nrow %lld, ncol %lld, nnz %lld\n", (long long) A->nrow,
+	    (long long) A->ncol, (long long) Astore->nnz);
+    if ( (dp = (float *) Astore->nzval) != NULL ) { /* nzval may be absent (pattern-only matrix) */
+        printf("nzval:\n");
+        for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
+    }
+    printf("\nrowind:\n");
+    for (i = 0; i < Astore->nnz; ++i)
+        printf("%lld  ", (long long) Astore->rowind[i]);
+    printf("\ncolptr:\n");
+    for (i = 0; i <= A->ncol; ++i)
+        printf("%lld  ", (long long) Astore->colptr[i]);
+    printf("\nend CompCol matrix.\n");
+}
+
+void sPrint_Dense_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of a dense (DN) matrix header and values to stdout. */
+    DNformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nDense matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (DNformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    printf("nrow %lld, ncol %lld, lda %lld\n",
+        (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
+    printf("\nnzval: ");
+    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]); /* NOTE(review): prints only the first nrow entries (first column when ncol > 1) -- confirm intended */
+    printf("\nend Dense matrix.\n");
+}
+
+int sPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{   /* Debug dump of this process's local block of a distributed row-compressed (NRformat_loc) matrix to stdout. Always returns 0. */
+    NRformat_loc  *Astore;
+    int_t  nnz_loc, m_loc;
+    float  *dp;
+
+    printf("\n==== CompRowLoc matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    printf("nrow %ld, ncol %ld\n",
+            (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    PrintInt10("rowptr", m_loc+1, Astore->rowptr);
+    PrintInt10("colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL ) /* nzval may be absent (pattern-only matrix) */
+        Printfloat5("nzval", nnz_loc, dp);
+    printf("==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+int file_sPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
+{   /* Same dump as sPrint_CompRowLoc_Matrix_dist, but written to the caller-supplied stream fp. Always returns 0. */
+    NRformat_loc     *Astore;
+    int_t  nnz_loc, m_loc;
+    float       *dp;
+
+    fprintf(fp, "\n==== CompRowLoc matrix: ");
+    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    fprintf(fp, "nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    fprintf(fp, "nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr);
+    file_PrintInt10(fp, "colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL ) /* nzval may be absent (pattern-only matrix) */
+        file_Printfloat5(fp, "nzval", nnz_loc, dp);
+    fprintf(fp, "==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+void
+sCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, float *x,
+			  int_t ldx, Stype_t stype, Dtype_t dtype,
+			  Mtype_t mtype)
+{   /* Fill header X as an m-by-n dense (DN) matrix over the caller-owned column-major array x with leading dimension ldx; x is adopted by reference, not copied. */
+    DNformat    *Xstore;
+
+    X->Stype = stype;
+    X->Dtype = dtype;
+    X->Mtype = mtype;
+    X->nrow = m;
+    X->ncol = n;
+    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); /* only the Store descriptor itself is allocated here */
+    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
+    Xstore = (DNformat *) X->Store;
+    Xstore->lda = ldx;
+    Xstore->nzval = (float *) x;
+}
+
+void
+sCopy_Dense_Matrix_dist(int_t M, int_t N, float *X, int_t ldx,
+			float *Y, int_t ldy)
+{
+/*! \brief
+ *
+ * 
+ *  Purpose
+ *  =======
+ *
+ *  Copies a two-dimensional matrix X to another matrix Y.
+ * 
+ */ + int i, j; + + for (j = 0; j < N; ++j) + for (i = 0; i < M; ++i) + Y[i + j*ldy] = X[i + j*ldx]; +} + +void +sCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz, + float *nzval, int_t *nzval_colptr, + int_t *rowind, int_t *rowind_colptr, + int_t *col_to_sup, int_t *sup_to_col, + Stype_t stype, Dtype_t dtype, Mtype_t mtype) +{ + SCformat *Lstore; + + L->Stype = stype; + L->Dtype = dtype; + L->Mtype = mtype; + L->nrow = m; + L->ncol = n; + L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) ); + if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store"); + Lstore = L->Store; + Lstore->nnz = nnz; + Lstore->nsuper = col_to_sup[n]; + Lstore->nzval = nzval; + Lstore->nzval_colptr = nzval_colptr; + Lstore->rowind = rowind; + Lstore->rowind_colptr = rowind_colptr; + Lstore->col_to_sup = col_to_sup; + Lstore->sup_to_col = sup_to_col; + +} + +/**** The following utilities are added per request of SUNDIALS ****/ + +/*! \brief Clone: Allocate memory for a new matrix B, which is of the same type + * and shape as A. + * The clone operation would copy all the non-pointer structure members like + * nrow, ncol, Stype, Dtype, Mtype from A and allocate a new nested Store + * structure. It would also copy nnz_loc, m_loc, fst_row from A->Store + * into B->Store. It does not copy the matrix entries, row pointers, + * or column indices. 
+ */ +void sClone_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B) +{ + NRformat_loc *Astore, *Bstore; + + B->Stype = A->Stype; + B->Dtype = A->Dtype; + B->Mtype = A->Mtype; + B->nrow = A->nrow;; + B->ncol = A->ncol; + Astore = (NRformat_loc *) A->Store; + B->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) ); + if ( !(B->Store) ) ABORT("SUPERLU_MALLOC fails for B->Store"); + Bstore = (NRformat_loc *) B->Store; + + Bstore->nnz_loc = Astore->nnz_loc; + Bstore->m_loc = Astore->m_loc; + Bstore->fst_row = Astore->fst_row; + if ( !(Bstore->nzval = (float *) floatMalloc_dist(Bstore->nnz_loc)) ) + ABORT("floatMalloc_dist fails for Bstore->nzval"); + if ( !(Bstore->colind = (int_t *) intMalloc_dist(Bstore->nnz_loc)) ) + ABORT("intMalloc_dist fails for Bstore->colind"); + if ( !(Bstore->rowptr = (int_t *) intMalloc_dist(Bstore->m_loc + 1)) ) + ABORT("intMalloc_dist fails for Bstore->rowptr"); + + return; +} + +/* \brief Copy: copies all entries, row pointers, and column indices of + * a matrix into another matrix of the same type, + * B_{i,j}=A_{i,j}, for i,j=1,...,n + */ +void sCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B) +{ + NRformat_loc *Astore, *Bstore; + + Astore = (NRformat_loc *) A->Store; + Bstore = (NRformat_loc *) B->Store; + + memcpy(Bstore->nzval, Astore->nzval, Astore->nnz_loc * sizeof(float)); + memcpy(Bstore->colind, Astore->colind, Astore->nnz_loc * sizeof(int_t)); + memcpy(Bstore->rowptr, Astore->rowptr, (Astore->m_loc+1) * sizeof(int_t)); + + return; +} + +/*! \brief Sets all entries of a matrix to zero, A_{i,j}=0, for i,j=1,..,n */ +void sZero_CompRowLoc_Matrix_dist(SuperMatrix *A) +{ + float zero = 0.0; + NRformat_loc *Astore = A->Store; + float *aval; + int_t i; + + aval = (float *) Astore->nzval; + for (i = 0; i < Astore->nnz_loc; ++i) aval[i] = zero; + + return; +} + +/*! \brief Scale and add I: scales a matrix and adds an identity. 
+ * A_{i,j} = c * A_{i,j} + \delta_{i,j} for i,j=1,...,n and + * \delta_{i,j} is the Kronecker delta. + */ +void sScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *A, float c) +{ + float one = 1.0; + NRformat_loc *Astore = A->Store; + float *aval = (float *) Astore->nzval; + int i, j; + float temp; + + for (i = 0; i < Astore->m_loc; ++i) { /* Loop through each row */ + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + if ( (Astore->fst_row + i) == Astore->colind[j] ) { /* diagonal */ + temp = aval[j] * c; + aval[j] = temp + one; + } else { + aval[j] *= c; + } + } + } + + return; +} + +/*! \brief Scale and add: adds a scalar multiple of one matrix to another. + * A_{i,j} = c * A_{i,j} + B_{i,j}$ for i,j=1,...,n + */ +void sScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, float c) +{ + NRformat_loc *Astore = A->Store; + NRformat_loc *Bstore = B->Store; + float *aval = (float *) Astore->nzval, *bval = (float *) Bstore->nzval; + int_t i; + float temp; + + for (i = 0; i < Astore->nnz_loc; ++i) { /* Loop through each nonzero */ + aval[i] = c * aval[i] + bval[i]; + } + + return; +} +/**** end utilities added for SUNDIALS ****/ + +/*! \brief Allocate storage in ScalePermstruct */ +void sScalePermstructInit(const int_t m, const int_t n, + sScalePermstruct_t *ScalePermstruct) +{ + ScalePermstruct->DiagScale = NOEQUIL; + if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) + ABORT("Malloc fails for perm_r[]."); + if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) + ABORT("Malloc fails for perm_c[]."); +} + +/*! 
\brief Deallocate ScalePermstruct */
void sScalePermstructFree(sScalePermstruct_t *ScalePermstruct)
{
    SUPERLU_FREE(ScalePermstruct->perm_r);
    SUPERLU_FREE(ScalePermstruct->perm_c);
    /* R/C are only present when the corresponding equilibration was done. */
    switch ( ScalePermstruct->DiagScale ) {
      case ROW:
	SUPERLU_FREE(ScalePermstruct->R);
	break;
      case COL:
	SUPERLU_FREE(ScalePermstruct->C);
	break;
      case BOTH:
	SUPERLU_FREE(ScalePermstruct->R);
	SUPERLU_FREE(ScalePermstruct->C);
	break;
      default: break;
    }
}

/*
 * The following are from 3D code p3dcomm.c
 */

/* Allocate the replicated supernode maps (xsup, supno) in Glu_persist. */
int sAllocGlu_3d(int_t n, int_t nsupers, sLUstruct_t * LUstruct)
{
    /*broadcasting Glu_persist*/
    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
    return 0;
}

// Sherry added
/* Free the replicated data on 3D process layer that is not grid-0 */
int sDeAllocGlu_3d(sLUstruct_t * LUstruct)
{
    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
    SUPERLU_FREE(LUstruct->Glu_persist->supno);
    return 0;
}

/* Free the replicated data on 3D process layer that is not grid-0 */
int sDeAllocLlu_3d(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
{
    int i, nbc, nbr, nsupers;
    sLocalLU_t *Llu = LUstruct->Llu;

    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;

    /* Local block columns of L on this process column. */
    nbc = CEILING(nsupers, grid3d->npcol);
    for (i = 0; i < nbc; ++i)
	if ( Llu->Lrowind_bc_ptr[i] ) {
	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
	}
    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
    SUPERLU_FREE (Llu->Lnzval_bc_ptr);

    /* Local block rows of U on this process row. */
    nbr = CEILING(nsupers, grid3d->nprow);
    for (i = 0; i < nbr; ++i)
	if ( Llu->Ufstnz_br_ptr[i] ) {
	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
	}
    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
    SUPERLU_FREE (Llu->Unzval_br_ptr);

    /* The following can be freed after factorization. */
    SUPERLU_FREE(Llu->ToRecv);
    SUPERLU_FREE(Llu->ToSendD);
    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
    SUPERLU_FREE(Llu->ToSendR);
    return 0;
} /* sDeAllocLlu_3d */


/**** Other utilities ****/

/* Generate a reference solution: entries alternate slightly above and
   below 1.0 depending on row parity. */
void
sGenXtrue_dist(int_t n, int_t nrhs, float *x, int_t ldx)
{
    int i, j;
    for (j = 0; j < nrhs; ++j)
	for (i = 0; i < n; ++i) {
	    if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n;
	    else x[i + j*ldx] = 1.0 - (double)(i+1.)/n;
	}
}

/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
 */
void
sFillRHS_dist(char *trans, int_t nrhs, float *x, int_t ldx,
	      SuperMatrix *A, float *rhs, int_t ldb)
{
    float one = 1.0;
    float zero = 0.0;

    /* rhs := 1*op(A)*x + 0*rhs */
    sp_sgemm_dist(trans, nrhs, one, A, x, ldx, zero, rhs, ldb);

}

/*! \brief Fills a float precision array with a given value.
 */
void
sfill_dist(float *a, int_t alen, float dval)
{
    register int_t i;
    for (i = 0; i < alen; i++) a[i] = dval;
}



/*! \brief Check the inf-norm of the error vector
 */
void sinf_norm_error_dist(int_t n, int_t nrhs, float *x, int_t ldx,
			  float *xtrue, int_t ldxtrue,
			  gridinfo_t *grid)
{
    double err, xnorm;
    float *x_work, *xtrue_work;
    int i, j;

    for (j = 0; j < nrhs; j++) {
	x_work = &x[j*ldx];
	xtrue_work = &xtrue[j*ldxtrue];
	err = xnorm = 0.0;
	for (i = 0; i < n; i++) {
	    err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
	    xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
	}
	/* NOTE(review): if x is identically zero this divides by zero
	   and prints inf/nan — acceptable for a diagnostic printout. */
	err = err / xnorm;
	printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
    }
}

/* Print a float array, five entries per line, to stdout. */
void Printfloat5(char *name, int_t len, float *x)
{
    register int_t i;

    printf("%10s:", name);
    for (i = 0; i < len; ++i) {
	if ( i % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) i, (long int) i+4);
	printf("%14e", x[i]);
    }
    printf("\n");
}

/* Same as Printfloat5, but to an open FILE stream. */
int file_Printfloat5(FILE *fp, char *name, int_t len, float *x)
{
    register int_t i;

    fprintf(fp, "%10s:", name);
    for (i = 0; i < len; ++i) {
	if ( i % 5 == 0 ) fprintf(fp, "\n[%ld-%ld] ", (long int) i, (long int) i+4);
	fprintf(fp, "%14e", x[i]);
    }
    fprintf(fp, "\n");
    return 0;
}

/*! \brief Print the blocks in the factored matrix L.
 */
void sPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
		   Glu_persist_t *Glu_persist, sLocalLU_t *Llu)
{
    register int c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
    register int_t k, mycol, r;
    int_t *xsup = Glu_persist->xsup;
    int_t *index;
    float *nzval;

    printf("\n[%d] L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
    /* Number of local block columns owned by this process column. */
    ncb = nsupers / grid->npcol;
    extra = nsupers % grid->npcol;
    mycol = MYCOL( iam, grid );
    if ( mycol < extra ) ++ncb;
    for (lb = 0; lb < ncb; ++lb) {
	index = Llu->Lrowind_bc_ptr[lb];
	if ( index ) { /* Not an empty column */
	    nzval = Llu->Lnzval_bc_ptr[lb];
	    nb = index[0];      /* number of row blocks */
	    nsupr = index[1];   /* total rows in this block column */
	    gb = lb * grid->npcol + mycol;  /* global block number */
	    nsupc = SuperSize( gb );
	    printf("[%d] block column %d (local # %d), nsupc %d, # row blocks %d\n",
		   iam, gb, lb, nsupc, nb);
	    /* Walk the row blocks: k indexes the descriptor area,
	       r the running nzval row offset. */
	    for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
		len = index[k+1];
		printf("[%d] row-block %d: block # " IFMT "\tlength %d\n",
		       iam, c, index[k], len);
		PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
		for (j = 0; j < nsupc; ++j) {
		    Printfloat5("nzval", len, &nzval[r + j*nsupr]);
		}
		k += LB_DESCRIPTOR + len;
		r += len;
	    }
	}
	printf("(%d)", iam);
	PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
    }
    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
    k = CEILING( nsupers, grid->nprow );
    PrintInt10("fmod", k, Llu->fmod);

} /* SPRINTLBLOCKS */


/*! \brief Sets all entries of matrix L to zero.
+ */ +void sZeroLblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + float zero = 0.0; + register int extra, gb, j, lb, nsupc, nsupr, ncb; + register int_t k, mycol, r; + sLocalLU_t *Llu = LUstruct->Llu; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t *xsup = Glu_persist->xsup; + int_t *index; + float *nzval; + int_t nsupers = Glu_persist->supno[n-1] + 1; + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (j = 0; j < nsupc; ++j) { + for (r = 0; r < nsupr; ++r) { + nzval[r + j*nsupr] = zero; + } + } + } + } +} /* end sZeroLblocks */ + + +/*! \brief Dump the factored matrix L using matlab triple-let format + */ +void sDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, + Glu_persist_t *Glu_persist, sLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + float *nzval; + char filename[256]; + FILE *fp, *fopen(); + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR 
+ len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; ixsup; + int_t *index; + float *nzval; + + printf("\n[%d] U BLOCKS IN ROW-MAJOR ORDER -->\n", iam); + nrb = nsupers / grid->nprow; + extra = nsupers % grid->nprow; + myrow = MYROW( iam, grid ); + if ( myrow < extra ) ++nrb; + for (lb = 0; lb < nrb; ++lb) { + index = Llu->Ufstnz_br_ptr[lb]; + if ( index ) { /* Not an empty row */ + nzval = Llu->Unzval_br_ptr[lb]; + nb = index[0]; + printf("[%d] block row " IFMT " (local # %d), # column blocks %d\n", + iam, lb*grid->nprow+myrow, lb, nb); + r = 0; + for (c = 0, k = BR_HEADER; c < nb; ++c) { + jb = index[k]; + len = index[k+1]; + printf("[%d] col-block %d: block # %d\tlength " IFMT "\n", + iam, c, jb, index[k+1]); + nsupc = SuperSize( jb ); + PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]); + Printfloat5("nzval", len, &nzval[r]); + k += UB_DESCRIPTOR + nsupc; + r += len; + } + + printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); + } + } +} /* end sPrintUlocks */ + +/*! \brief Sets all entries of matrix U to zero. 
 */
void sZeroUblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct)
{
    float zero = 0.0;
    register int i, extra, lb, len, nrb;
    register int myrow, r;
    sLocalLU_t *Llu = LUstruct->Llu;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    int_t *xsup = Glu_persist->xsup;
    int_t *index;
    float *nzval;
    int nsupers = Glu_persist->supno[n-1] + 1;

    /* Number of local block rows owned by this process row. */
    nrb = nsupers / grid->nprow;
    extra = nsupers % grid->nprow;
    myrow = MYROW( iam, grid );
    if ( myrow < extra ) ++nrb;
    for (lb = 0; lb < nrb; ++lb) {
	index = Llu->Ufstnz_br_ptr[lb];
	if ( index ) { /* Not an empty row */
	    nzval = Llu->Unzval_br_ptr[lb];
	    len = index[1]; // number of entries in nzval[];
	    for (i = 0; i < len; ++i) {
		nzval[i] = zero;
	    }
	}
    }
} /* end sZeroUlocks */

/* Dump the distributed sparse-matvec communication schedule to fp. */
int
sprint_gsmv_comm(FILE *fp, int_t m_loc, psgsmv_comm_t *gsmv_comm,
		 gridinfo_t *grid)
{
    int_t procs = grid->nprow*grid->npcol;
    fprintf(fp, "TotalIndSend " IFMT "\tTotalValSend " IFMT "\n", gsmv_comm->TotalIndSend,
	    gsmv_comm->TotalValSend);
    file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
    file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend);
    /* NOTE(review): ind_torecv is printed with length TotalValSend —
       verify this is intentional and not TotalIndRecv. */
    file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv);
    file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend);
    file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv);
    file_PrintInt32(fp, "SendCounts", procs, gsmv_comm->SendCounts);
    file_PrintInt32(fp, "RecvCounts", procs, gsmv_comm->RecvCounts);
    return 0;
}


/* Build xact = all-ones solution and b = A*xact for the local rows
   owned by this process row; allocates *xact and *b (caller frees). */
void
sGenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist,
	     gridinfo_t *grid, float **xact, int *ldx, float **b, int *ldb)
{
    int_t gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows,
	  nsupr, nsupers, rel;
    int_t *supno, *xsup, *lxsup;
    float *x, *bb;
    NCformat *Astore;
    float *aval;

    n = A->ncol;
    *ldb = 0;
    supno = Glu_persist->supno;
    xsup = Glu_persist->xsup;
    nsupers = supno[n-1] + 1;
    iam = grid->iam;
    myrow = MYROW( iam, grid );
    Astore = (NCformat *) A->Store;
    aval = Astore->nzval;
    lb = CEILING( nsupers, grid->nprow ) + 1;
    if ( !(lxsup = intMalloc_dist(lb)) )
	ABORT("Malloc fails for lxsup[].");

    /* First pass: count local rows and record, per local supernode,
       its starting offset in the local RHS (lxsup). */
    lsup = 0;
    nlrows = 0;
    for (j = 0; j < nsupers; ++j) {
	i = PROW( j, grid );
	if ( myrow == i ) {
	    nsupr = SuperSize( j );
	    *ldb += nsupr;
	    lxsup[lsup++] = nlrows;
	    nlrows += nsupr;
	}
    }
    *ldx = n;
    if ( !(x = floatMalloc_dist(((size_t)*ldx) * nrhs)) )
	ABORT("Malloc fails for x[].");
    if ( !(bb = floatCalloc_dist(*ldb * nrhs)) )
	ABORT("Calloc fails for bb[].");
    for (j = 0; j < nrhs; ++j)
	for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0;

    /* Form b = A*x. */
    for (j = 0; j < n; ++j)
	for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
	    irow = Astore->rowind[i];
	    gb = supno[irow];
	    gbrow = PROW( gb, grid );
	    if ( myrow == gbrow ) {
		rel = irow - xsup[gb];      /* row offset within supernode */
		lb = LBi( gb, grid );       /* local block number */
		bb[lxsup[lb] + rel] += aval[i] * x[j];
	    }
	}

    /* Memory allocated but not freed: xact, b */
    *xact = x;
    *b = bb;

    SUPERLU_FREE(lxsup);

#if ( PRNTlevel>=2 )
    for (i = 0; i < grid->nprow*grid->npcol; ++i) {
	if ( iam == i ) {
	    printf("\n(%d)\n", iam);
	    Printfloat5("rhs", *ldb, *b);
	}
	MPI_Barrier( grid->comm );
    }
#endif

} /* GENXTRUERHS */

/* g5.rua
       b = A*x    y = L\b
   0   1          1.0000
   1   0          0.2500
   2   1          1.0000
   3   2          2.0000
   4   1          1.7500
   5   1          1.8917
   6   0          1.1879
   7   2          2.0000
   8   2          2.0000
   9   1          1.0000
  10   1          1.7500
  11   0          0
  12   1          1.8750
  13   2          2.0000
  14   1          1.0000
  15   0          0.2500
  16   1          1.7667
  17   0          0.6419
  18   1          2.2504
  19   0          1.1563
  20   0          0.9069
  21   0          1.4269
  22   1          2.7510
  23   1          2.2289
  24   0          2.4332

   g6.rua
       b=A*x      y=L\b
   0   0          0
   1   1          1.0000
   2   1          1.0000
   3   2          2.5000
   4   0          0
   5   2          2.0000
   6   1          1.0000
   7   1          1.7500
   8   1          1.0000
   9   0          0.2500
  10   0          0.5667
  11   1          2.0787
  12   0          0.8011
  13   1          1.9838
  14   1          1.0000
  15   1          1.0000
  16   2          2.5000
  17   0          0.8571
  18   0          0
  19   1          1.0000
  20   0          0.2500
  21   1
1.0000 + 22 2 2.0000 + 23 1 1.7500 + 24 1 1.8917 + 25 0 1.1879 + 26 0 0.8011 + 27 1 1.9861 + 28 1 2.0199 + 29 0 1.3620 + 30 0 0.6136 + 31 1 2.3677 + 32 0 1.1011 + 33 0 1.5258 + 34 0 1.7628 + 35 0 2.1658 +*/ diff --git a/SRC/symbfact.c b/SRC/symbfact.c index 648853a1..6da8a99a 100644 --- a/SRC/symbfact.c +++ b/SRC/symbfact.c @@ -185,6 +185,7 @@ int_t symbfact if ( !pnum && (options->PrintStat == YES)) { nnzLU = nnzL + nnzU - min_mn; + printf("\tMatrix size min_mn " IFMT "\n", min_mn); printf("\tNonzeros in L " IFMT "\n", nnzL); printf("\tNonzeros in U " IFMT "\n", nnzU); printf("\tnonzeros in L+U " IFMT "\n", nnzLU); @@ -528,8 +529,8 @@ static int_t column_dfs */ lsub[nextl++] = krow; /* krow is indexed into A */ if ( nextl >= nzlmax ) { - if ( mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, - &nzlmax, Glu_freeable) ) + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, + &nzlmax, Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -570,10 +571,10 @@ static int_t column_dfs if ( chperm == EMPTY ) { lsub[nextl++] = kchild; if ( nextl >= nzlmax ) { - if ( mem_error = + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, &nzlmax, - Glu_freeable) ) + Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -786,8 +787,8 @@ static int_t set_usub new_next = nextu + nseg; while ( new_next > nzumax ) { - if (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, - Glu_freeable)) + if ( (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, + Glu_freeable)) ) return (mem_error); usub = Glu_freeable->usub; } diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c new file mode 100644 index 00000000..dd677995 --- /dev/null +++ b/SRC/treeFactorization.c @@ -0,0 +1,446 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief factorization routines in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + + #include "superlu_ddefs.h" + #if 0 + #include "treeFactorization.h" + #include "trfCommWrapper.h" + #endif + + #if 0 /******** Sherry: Remove extra layer of function calls. *******/ + int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + superlu_dist_options_t *options, + double thresh, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, int *info, + SCT_t *SCT, + int tag_ub + ) + { + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + MPI_Request * U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request * L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + + double * BlockUFactor = dFBuf->BlockUFactor; + double * BlockLFactor = dFBuf->BlockLFactor; + dDiagFactIBCast(k, k, BlockUFactor, BlockLFactor, + IrecvPlcd_D, + U_diag_blk_recv_req, L_diag_blk_recv_req, + U_diag_blk_send_req, L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); + return 0; + } + int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) + { + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + + dLPanelUpdate( k, IrecvPlcd_D, factored_L, + U_diag_blk_recv_req, BlockUFactor, grid, LUstruct, SCT); + return 0; + } + + int_t sUPanelUpdate( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) + { + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + + MPI_Request * L_diag_blk_recv_req = 
comReqs->L_diag_blk_recv_req; + + int_t * factored_U = factStat->factored_U; + + double * BlockLFactor = dFBuf->BlockLFactor; + dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor, bigV, ldt, + Ublock_info, grid, LUstruct, stat, SCT); + return 0; + } + int_t sIBcastRecvLPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) + + { + int* msgcnt = msgs->msgcnt; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t* factored = factStat->factored; + dIBcastRecvLPanel(k, k, + msgcnt, send_req, recv_req, + Lsub_buf, Lval_buf, factored, grid, LUstruct, SCT, tag_ub); + return 0; +} + +int_t sIBcastRecvUPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + dIBcastRecvUPanel(k, k, msgcnt, send_requ, recv_requ, Usub_buf, + Uval_buf, grid, LUstruct, SCT, tag_ub); + return 0; +} +int_t sWaitL(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + int* msgcntU = msgs->msgcntU; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + dWaitL(k, msgcnt, msgcntU, send_req, recv_req, grid, LUstruct, SCT); + return 0; +} +int_t sWaitU(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + dWaitU(k, msgcnt, 
send_requ, recv_requ, grid, LUstruct, SCT); + return 0; +} +int_t sWait_LUDiagSend(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + MPI_Request *U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request *L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + Wait_LUDiagSend(k, U_diag_blk_send_req, L_diag_blk_send_req, grid, SCT); + return 0; +} +int_t sSchurComplementSetup(int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* gIperm_c_supno, int_t*perm_c_supno, + factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct) +{ + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + Remain_info_t* Remain_info = packLUInfo->Remain_info; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int* msgcnt = msgs->msgcnt; + int_t* iperm_u = fNlists->iperm_u; + int_t* perm_u = fNlists->perm_u; + + double* bigU = scuBufs->bigU; + return dSchurComplementSetup(k, msgcnt, + Ublock_info, Remain_info, uPanelInfo, lPanelInfo, + gIperm_c_supno, iperm_u, perm_u, + bigU, Lsub_buf, Lval_buf, Usub_buf, Uval_buf, + grid, LUstruct); +} +int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct); + + return 0; +} + +int_t sUPanelTrSolve( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + double * BlockLFactor = 
dFBuf->BlockLFactor; + + dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT); + return 0; +} + +#endif /******** End removing extra layer of function calls. *******/ + + +int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid) +{ + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + // allocating MPI requests (for one) + comReqs->U_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->L_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->U_diag_blk_send_req = MPI_REQ_ALLOC( Pr ); + comReqs->L_diag_blk_send_req = MPI_REQ_ALLOC( Pc ); + comReqs->send_req = MPI_REQ_ALLOC(2 * Pc); + comReqs->recv_req = MPI_REQ_ALLOC(4); + comReqs->send_requ = MPI_REQ_ALLOC(2 * Pr); + comReqs->recv_requ = MPI_REQ_ALLOC(2); + return 0; +} + +commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +{ + commRequests_t** comReqss; + comReqss = (commRequests_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(commRequests_t*)); + for (int i = 0; i < mxLeafNode; ++i) + { + /* code */ + comReqss[i] = (commRequests_t* ) SUPERLU_MALLOC(sizeof(commRequests_t)); + initCommRequests(comReqss[i], grid); + }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/ + return comReqss; +} + +// sherry added +int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss) +{ + for (int i = 0; i < mxLeafNode; ++i) { + SUPERLU_FREE(comReqss[i]->U_diag_blk_recv_req); + SUPERLU_FREE(comReqss[i]->L_diag_blk_recv_req); + SUPERLU_FREE(comReqss[i]->U_diag_blk_send_req); + SUPERLU_FREE(comReqss[i]->L_diag_blk_send_req); + SUPERLU_FREE(comReqss[i]->send_req); + SUPERLU_FREE(comReqss[i]->recv_req); + SUPERLU_FREE(comReqss[i]->send_requ); + SUPERLU_FREE(comReqss[i]->recv_requ); + SUPERLU_FREE(comReqss[i]); + } + SUPERLU_FREE(comReqss); + return 0; +} + +int_t initFactStat(int_t nsupers, factStat_t* factStat) +{ + factStat->IrecvPlcd_D = intMalloc_dist( nsupers); + factStat->factored_D = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + 
factStat->factored_L = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored_U = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->IbcastPanel_L = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->IbcastPanel_U = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->gpuLUreduced = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + factStat->IrecvPlcd_D[i] = 0; + factStat->factored_D[i] = 0; + factStat->factored_L[i] = 0; + factStat->factored_U[i] = 0; + factStat->IbcastPanel_L[i] = 0; + factStat->IbcastPanel_U[i] = 0; + factStat->gpuLUreduced[i] = 0; + } + return 0; +} + +int freeFactStat(factStat_t* factStat) +{ + SUPERLU_FREE(factStat->IrecvPlcd_D); + SUPERLU_FREE(factStat->factored_D); + SUPERLU_FREE(factStat->factored_L); + SUPERLU_FREE(factStat->factored_U); + SUPERLU_FREE(factStat->factored); + SUPERLU_FREE(factStat->IbcastPanel_L); + SUPERLU_FREE(factStat->IbcastPanel_U); + SUPERLU_FREE(factStat->gpuLUreduced); + return 0; +} + +int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers, + factNodelists_t* fNlists) +{ + fNlists->iperm_u = INT_T_ALLOC(nsupers); + fNlists->perm_u = INT_T_ALLOC(nsupers); +#if 0 // Sherry: change to int type + fNlists->indirect = INT_T_ALLOC(num_threads * ldt); + fNlists->indirect2 = INT_T_ALLOC(num_threads * ldt); +#else + fNlists->indirect = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); + fNlists->indirect2 = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); +#endif + return 0; +} + +int freeFactNodelists(factNodelists_t* fNlists) +{ + SUPERLU_FREE(fNlists->iperm_u); + SUPERLU_FREE(fNlists->perm_u); + SUPERLU_FREE(fNlists->indirect); + SUPERLU_FREE(fNlists->indirect2); + return 0; +} + +int_t initMsgs(msgs_t* msgs) +{ + msgs->msgcnt = (int *) SUPERLU_MALLOC(4 * sizeof(int)); + msgs->msgcntU = (int *) SUPERLU_MALLOC(4 * 
sizeof(int)); + return 0; +} + +msgs_t** initMsgsArr(int_t numLA) +{ + msgs_t**msgss = (msgs_t**) SUPERLU_MALLOC(numLA * sizeof(msgs_t*)); + for (int_t i = 0; i < numLA; ++i) + { + /* code */ + msgss[i] = (msgs_t*) SUPERLU_MALLOC(sizeof(msgs_t)); + initMsgs(msgss[i]); + } /*minor for loop-3 for (int i = 0; i < numLA; ++i)*/ + return msgss; +} + +// sherry added +int freeMsgsArr(int_t numLA, msgs_t **msgss) +{ + for (int i = 0; i < numLA; ++i) { + SUPERLU_FREE(msgss[i]->msgcnt); + SUPERLU_FREE(msgss[i]->msgcntU); + SUPERLU_FREE(msgss[i]); + } + SUPERLU_FREE(msgss); + return 0; +} + +int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo) +{ + packLUInfo->Ublock_info = (Ublock_info_t*) SUPERLU_MALLOC (sizeof(Ublock_info_t) * nsupers); + packLUInfo->Remain_info = (Remain_info_t* ) SUPERLU_MALLOC(sizeof(Remain_info_t) * nsupers); + packLUInfo->uPanelInfo = (uPanelInfo_t* ) SUPERLU_MALLOC(sizeof(uPanelInfo_t)); + packLUInfo->lPanelInfo = (lPanelInfo_t*) SUPERLU_MALLOC(sizeof(lPanelInfo_t)); + return 0; +} + +int freePackLUInfo(packLUInfo_t* packLUInfo) // sherry added +{ + SUPERLU_FREE(packLUInfo->Ublock_info); + SUPERLU_FREE(packLUInfo->Remain_info); + SUPERLU_FREE(packLUInfo->uPanelInfo); + SUPERLU_FREE(packLUInfo->lPanelInfo); + return 0; +} + +int_t getNumLookAhead(superlu_dist_options_t *options) +{ + int_t numLA; + if (getenv("NLULA")) + { + numLA = atoi(getenv("NLULA")); + } + else + { + // printf("NLULA not set using default 2\n"); + // numLA = 2; + numLA = options->num_lookaheads; + } + return numLA; +} + +int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + int_t kcol = PCOL (k, grid); + + if (mycol == kcol && iam != pkk) + { + int_t flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + return flag; + } + + return 1; +} + 
+int_t checkRecvLDiag(int_t k, + commRequests_t *comReqs, + gridinfo_t *grid, + SCT_t *SCT) +{ + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && iam != pkk) + { + int_t flag = 0; + + flag = Test_LDiagBlock_Recv( L_diag_blk_recv_req , SCT); + + return flag; + } + return 1; +} + + diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c new file mode 100644 index 00000000..066edcab --- /dev/null +++ b/SRC/treeFactorizationGPU.c @@ -0,0 +1,716 @@ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "dlustruct_gpu.h" + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? 
Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} + +int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + diagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int_t ldt, + sluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + 
int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + syncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + initD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, 
info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int_t k_st = eTreeTopLims[topoLvl]; + int_t k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + initD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + 
sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t *indirect = fNlists->indirect; + int_t *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + double *bigV = scuBufs->bigV; + double *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#pragma omp parallel /* Look-ahead update on CPU */ + { + int_t thread_id = omp_get_thread_num(); + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, 
klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int_t offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + initD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + sendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + reduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + 
grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#pragma omp parallel + { + /* Master thread performs Schur complement update on GPU. */ +#pragma omp master + { + if (superlu_acc_offload) + { + int_t thread_id = omp_get_thread_num(); + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int_t streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + waitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + sendSCUdataHost2GPU(streamId, lsub, usub, bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + SchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#pragma omp for + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int_t kx_st = eTreeTopLims[topoLvl + 1]; + int_t kx_end = eTreeTopLims[topoLvl + 2]; + for (int_t k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int_t kx = perm_c_supno[k0x]; + int_t offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = 
checkRecvUDiag(kx, comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* 
end Schur complement update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end sparseTreeFactor_ASYNC_GPU */ diff --git a/SRC/trfAux.c b/SRC/trfAux.c new file mode 100644 index 00000000..ff96d1b3 --- /dev/null +++ b/SRC/trfAux.c @@ -0,0 +1,1280 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines to support 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + +#include "superlu_ddefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfAux.h" +#endif + + +int_t getslu25D_enabled() +{ + if ( getenv("SLU25D") != NULL) + { + return atoi(getenv("SLU25D")); + } + else + { + return 0; + } +} + +int getNsupers(int n, Glu_persist_t *Glu_persist) +{ + int nsupers = Glu_persist->supno[n - 1] + 1; + return nsupers; +} + +int set_tag_ub() +{ + void *attr_val; + int flag; + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) + { + fprintf (stderr, "Could not get TAG_UB\n"); + exit(-1); + } + return ( *(int_t *) attr_val ); +} + +int getNumThreads(int iam) +{ + int num_threads = 1; +#ifdef _OPENMP + #pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + + } + } +#endif + + if (!iam) + { + printf(".. Starting with %d openMP threads \n", num_threads ); + + } + return num_threads; +} + + +#if 0 //**** Sherry: following two routines are old, the new ones are in util.c +int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u) +{ + int_t lk = LBi (kk, grid); + int_t *usub = Ufstnz_br_ptr[lk]; + + if (usub == NULL) + { + /* code */ + return 0; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC (kk + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t temp_ncols = 0; + int_t segsize; + + temp_ncols = 0; + + for (int_t j = 0; j < nub; ++j) + { + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + } + return temp_ncols; +} + +// Sherry: this is old; new version is in util.c +int_t estimate_bigu_size( int_t nsupers, int_t ldt, 
int_t**Ufstnz_br_ptr, + Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u) +{ + + int_t iam = grid->iam; + + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + + int_t* xsup = Glu_persist->xsup; + + int ncols = 0; + int_t ldu = 0; + + /*initilize perm_u*/ + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + } + + for (int lk = myrow; lk < nsupers; lk += Pr ) + { + ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, + xsup, grid, perm_u, &ldu)); + } + + int_t max_ncols = 0; + + MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); + + printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols); + return ldt * max_ncols; +} /* old estimate_bigu_size. New one is in util.c */ +#endif /**** end old ones ****/ + +int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr) +//LUstruct_t *LUstruct) +{ + + int_t Pr = grid->nprow; + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + + /* Following circuit is for finding maximum block size */ + int local_max_row_size = 0; + int max_row_size; + + for (int_t i = 0; i < nsupers; ++i) + { + int_t tpc = PCOL (i, grid); + if (mycol == tpc) + { + int_t lk = LBj (i, grid); + //int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk]; + int_t* lsub = Lrowind_bc_ptr[lk]; + if (lsub != NULL) + { + local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]); + } + } + + } + + /* Max row size is global reduction of within A row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + + // int_t Threads_per_process = get_thread_per_process (); + + /*Buffer size is max of of look ahead window*/ + + + int_t bigu_size = + 8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1); + + return bigu_size; +} + +int_t* getFactPerm(int_t nsupers) +{ + int_t* perm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + perm[i] = i; + } + + 
return perm; +} + +int_t* getFactIperm(int_t* perm, int_t nsupers) +{ + int_t* iperm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + iperm[perm[i]] = i; + } + + return iperm; +} + +int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *options, + int_t *etree, Glu_persist_t *Glu_persist, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo_t *grid) + +{ + /*I do not understand the following code in detail, + I have just written a wrapper around it*/ + + int_t* perm_c_supno; + //Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + //LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, *Lblock, *Lrows, + *sf_block, *sf_block_l, *nnodes_l, *nnodes_u, *edag_supno_l, *recvbuf, + **edag_supno; + int_t i, ib, jb, + lb, + nlb, il, iu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *index; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + int_t j, k, krow, yourcol; + etree_node *head, *tail, *ptr; + int *num_child; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if ( options->lookahead_etree == YES && /* use e-tree of symmetrized matrix, and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM)) ) /* no rowperm to destroy the symmetry */ + { + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if ( options->ParSymbFact == NO ) + { + /* Use the etree computed from serial symb. 
fact., and turn it + into supernodal tree. */ + //int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using column e-tree ===\n" ); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno[i] = nsupers; + for ( j = 0, lb = 0; lb < nsupers; lb++ ) + { + for ( k = 0; k < SuperSize(lb); k++ ) + { + jb = Glu_persist->supno[etree[j + k]]; + if ( jb != lb ) etree_supno[lb] = SUPERLU_MIN( etree_supno[lb], jb ); + } + j += SuperSize(lb); + } + } + else /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + { + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). */ +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using supernodal e-tree ===\n" ); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno_l[i] = nsupers; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * grid->npcol + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * grid->npcol + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } 
+ if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + MPI_Allreduce( etree_supno_l, etree_supno, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE(etree_supno_l); + } + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 0; + for ( i = 0; i < nsupers; i++ ) if ( etree_supno[i] != nsupers ) num_child[etree_supno[i]] ++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + ptr = head; j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + if ( etree_supno[j] != nsupers ) + { + num_child[etree_supno[j]] --; + if ( num_child[etree_supno[j]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + SUPERLU_FREE(etree_supno); + + } + else /* Unsymmetric pattern */ + { + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very 
naive implementation) */ + int nrbp1 = nrb + 1; + + /* allocate some workspace */ + if ( !(sendcnts = SUPERLU_MALLOC( (4 + 2 * nrbp1) * Pr * Pc * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls [Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls [Pr * Pc]; + rrows = &srows [Pr * Pc * nrbp1]; + + myrow = MYROW( iam, grid ); +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using DAG ===\n" ); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++ ) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[lb]; + index = Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + lb] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[nrb]; + index = Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + nrb] ++; + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if ( !(blocks = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for 
blocks[]."); + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[lb]; + index = Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + //index = Llu->Ufstnz_br_ptr[nrb]; + index = Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* communication */ + MPI_Alltoall( sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + MPI_Alltoall( srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm ); + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) + { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + nblocks += recvcnts[p]; + } + + if ( !(blockr = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for blockr[]."); + MPI_Alltoallv( blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE( blocks ); + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if ( !(Ublock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Ublock[]."); + if ( !(Urows = intMalloc_dist( 1 + nlb )) ) ABORT("Malloc fails for Urows[]."); + k = 0; + for (jb = 0; jb < nlb; jb++ ) + { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; 
Urows[jb] ++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1( Urows[jb], &(Ublock[k - Urows[jb]]) ); +#else + qsort( &(Ublock[k - Urows[jb]]), (size_t)(Urows[jb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; Urows[nlb] ++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1( Urows[nlb], &(Ublock[k - Urows[nlb]]) ); +#else + qsort( &(Ublock[k - Urows[nlb]]), (size_t)(Urows[nlb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + SUPERLU_FREE( blockr ); + + /* sort the block in L-factor */ + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + + if ( !(Lblock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Lblock[]."); + if ( !(Lrows = intMalloc_dist( 1 + ncb )) ) ABORT("Malloc fails for Lrows[]."); + for ( lb = 0; lb <= ncb; lb++ ) Lrows[lb] = 0; + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[lb]; + index = Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[lb] ++; + 
nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1( Lrows[lb], &(Lblock[nblocks - Lrows[lb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[lb]]), (size_t)(Lrows[lb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + //index = Llu->Lrowind_bc_ptr[ncb]; + index = Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[ncb] ++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1( Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[ncb]]), (size_t)(Lrows[ncb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if ( !(sf_block = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for sf_block[]."); + if ( !(sf_block_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for sf_block_l[]."); + for ( lb = 0; lb < nsupers; lb++ ) sf_block_l[lb] = nsupers; + i = 0; j = 0; + for ( jb = 0; jb < nlb; jb++ ) + { + if ( Urows[jb] > 0 ) + { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for ( k = 0; k < Lrows[jb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } + else + { + j++; + } + } + i = ib; + } + else + { + j += Lrows[jb]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + if ( Urows[nlb] > 0 ) + { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for ( k = 0; k < Lrows[nlb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + 
j++; + } + } + i = ib; + } + else + { + j += Lrows[nlb]; + } + } + /* compute the first global symmetric matchs */ + MPI_Allreduce( sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE( sf_block_l ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if ( !(nnodes_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_l[]."); + if ( !(nnodes_u = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_u[]."); + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + if ( mycol == pc ) + 
{ + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + } + SUPERLU_FREE( nnodes_u ); + + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + SUPERLU_FREE(recvbuf); +#else + int nlsupers = nsupers / Pc; + if ( mycol < nsupers % Pc ) nlsupers ++; + + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nlsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( mycol == pc ) + { + edag_supno_l[nblocks] = 
nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + else if ( nnodes_l[lb] + nnodes_u[lb] != 0 ) + printf( " # %d: nnodes[%d]=%d+%d\n", grid->iam, + (int) lb, (int) nnodes_l[lb], (int) nnodes_u[lb] ); + } + SUPERLU_FREE( nnodes_u ); + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + } + SUPERLU_FREE(recvbuf); 
+#endif + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 0; + for ( i = 0; i < nsupers; i++ ) + { + for ( jb = 0; jb < nnodes_l[i]; jb++ ) + { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + + /*printf( "=== pop %d (%d) ===\n",head->id,i );*/ + ptr = head; j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + for ( jb = 0; jb < nnodes_l[j]; jb++ ) + { + num_child[edag_supno[j][jb]]--; + if ( num_child[edag_supno[j][jb]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + + for ( lb = 0; lb < nsupers; lb++ ) if ( nnodes_l[lb] > 0 ) SUPERLU_FREE(edag_supno[lb] ); + SUPERLU_FREE(edag_supno); + SUPERLU_FREE(nnodes_l); + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(sf_block); + SUPERLU_FREE(Ublock); + SUPERLU_FREE(Urows); + SUPERLU_FREE(Lblock); + SUPERLU_FREE(Lrows); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + return perm_c_supno; +} /* getPerm_c_supno */ + + +int_t Trs2_InitUblock_info(int_t klst, int_t nb, + Ublock_info_t *Ublock_info, + int_t 
*usub, + Glu_persist_t *Glu_persist, SuperLUStat_t *stat ) +{ + int_t *xsup = Glu_persist->xsup; + int_t iukp, rukp; + iukp = BR_HEADER; + rukp = 0; + + for (int_t b = 0; b < nb; ++b) + { + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + + Ublock_info[b].iukp = iukp; + Ublock_info[b].rukp = rukp; + // Ublock_info[b].nsupc = nsupc; + + iukp += UB_DESCRIPTOR; + /* Sherry: can remove this loop for rukp + rukp += usub[iukp-1]; + */ + for (int_t j = 0; j < nsupc; ++j) + { + int_t segsize = klst - usub[iukp++]; + rukp += segsize; + stat->ops[FACT] += segsize * (segsize + 1); + } + } + return 0; +} + +void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup, + int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, + gridinfo3d_t * grid3d + ) +{ + gridinfo_t* grid = &(grid3d->grid2d); + //int_t** Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr; + //int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr; + //int_t* xsup = LUstruct->Glu_persist->xsup; + + int_t * perm_u = INT_T_ALLOC(nsupers); + int_t * mylsize = INT_T_ALLOC(nsupers); + int_t * myusize = INT_T_ALLOC(nsupers); + // int_t * maxlsize = INT_T_ALLOC(nsupers); + // int_t * maxusize = INT_T_ALLOC(nsupers); + int ldu; + + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + mylsize[i] = 0; + myusize[i] = 0; + } + + for (int_t k = 0; k < nsupers ; ++k) + { + treeList[k].scuWeight = 0.0; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t ldu; + + if (myrow == krow) + { + /* code */ + myusize[k] = num_full_cols_U(k, Ufstnz_br_ptr, xsup, grid, + perm_u, &ldu); + } + + if (mycol == kcol) + { + /* code */ + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + // double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + if (lsub) + { + /* code */ + mylsize[k] = lsub[1]; + } + } + } + + // int_t maxlsize = 0; + MPI_Allreduce( 
MPI_IN_PLACE, mylsize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + // int_t maxusize = 0; + MPI_Allreduce( MPI_IN_PLACE, myusize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + + for (int_t k = 0; k < nsupers ; ++k) + { + + treeList[k].scuWeight = 0.0; + int_t ksupc = SuperSize(k); + treeList[k].scuWeight = 1.0 * ksupc * mylsize[k] * myusize[k]; + } + + SUPERLU_FREE(mylsize); + SUPERLU_FREE(myusize); + SUPERLU_FREE(perm_u); + +} /* getSCUweight */ + diff --git a/SRC/util.c b/SRC/util.c index c70ee409..30175ee2 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -26,273 +26,65 @@ at the top-level directory. #include "superlu_ddefs.h" /*! \brief Deallocate the structure pointing to the actual storage of the matrix. */ -void -Destroy_SuperMatrix_Store_dist(SuperMatrix *A) +void Destroy_SuperMatrix_Store_dist(SuperMatrix *A) { - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(A->Store); } -void -Destroy_CompCol_Matrix_dist(SuperMatrix *A) +void Destroy_CompCol_Matrix_dist(SuperMatrix *A) { NCformat *Astore = A->Store; - SUPERLU_FREE( Astore->rowind ); - SUPERLU_FREE( Astore->colptr ); - if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowind); + SUPERLU_FREE(Astore->colptr); + if (Astore->nzval) + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) +void Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) { NRformat_loc *Astore = A->Store; - SUPERLU_FREE( Astore->rowptr ); - SUPERLU_FREE( Astore->colind ); - SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowptr); + SUPERLU_FREE(Astore->colind); + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRow_Matrix_dist(SuperMatrix *A) +void Destroy_CompRow_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); - SUPERLU_FREE( ((NRformat *)A->Store)->colind ); - SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); + 
SUPERLU_FREE(((NRformat *)A->Store)->rowptr); + SUPERLU_FREE(((NRformat *)A->Store)->colind); + SUPERLU_FREE(((NRformat *)A->Store)->nzval); + SUPERLU_FREE(A->Store); } -void -Destroy_SuperNode_Matrix_dist(SuperMatrix *A) +void Destroy_SuperNode_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); - SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((SCformat *)A->Store)->rowind); + SUPERLU_FREE(((SCformat *)A->Store)->rowind_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->nzval); + SUPERLU_FREE(((SCformat *)A->Store)->nzval_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->col_to_sup); + SUPERLU_FREE(((SCformat *)A->Store)->sup_to_col); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==NCP */ -void -Destroy_CompCol_Permuted_dist(SuperMatrix *A) +void Destroy_CompCol_Permuted_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); - SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((NCPformat *)A->Store)->colbeg); + SUPERLU_FREE(((NCPformat *)A->Store)->colend); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==DN */ -void -Destroy_Dense_Matrix_dist(SuperMatrix *A) +void Destroy_Dense_Matrix_dist(SuperMatrix *A) { - DNformat* Astore = A->Store; - SUPERLU_FREE (Astore->nzval); - SUPERLU_FREE ( A->Store ); -} - -#if 0 // moved to precision-dependent routines - -/*! \brief Destroy distributed L & U matrices. 
*/ -void -Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) -{ - int_t i, nb, nsupers; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_Tree()"); -#endif - - nsupers = Glu_persist->supno[n-1] + 1; - - nb = CEILING(nsupers, grid->npcol); - for (i=0;iLBtree_ptr[i].empty_==NO){ - // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); - C_BcTree_Nullify(&Llu->LBtree_ptr[i]); - } - if(Llu->UBtree_ptr[i].empty_==NO){ - // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); - C_BcTree_Nullify(&Llu->UBtree_ptr[i]); - } - } - SUPERLU_FREE(Llu->LBtree_ptr); - SUPERLU_FREE(Llu->UBtree_ptr); - - nb = CEILING(nsupers, grid->nprow); - for (i=0;iLRtree_ptr[i].empty_==NO){ - // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); - C_RdTree_Nullify(&Llu->LRtree_ptr[i]); - } - if(Llu->URtree_ptr[i].empty_==NO){ - // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); - C_RdTree_Nullify(&Llu->URtree_ptr[i]); - } - } - SUPERLU_FREE(Llu->LRtree_ptr); - SUPERLU_FREE(Llu->URtree_ptr); - - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit Destroy_Tree()"); -#endif + DNformat *Astore = A->Store; + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(A->Store); } -/*! \brief Destroy distributed L & U matrices. 
*/ -void -Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) -{ - int_t i, nb, nsupers; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_LU()"); -#endif - - Destroy_Tree(n, grid, LUstruct); - - nsupers = Glu_persist->supno[n-1] + 1; - - nb = CEILING(nsupers, grid->npcol); - // for (i = 0; i < nb; ++i) - // if ( Llu->Lrowind_bc_ptr[i] ) { - // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); - // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); - // } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - SUPERLU_FREE (Llu->Lrowind_bc_dat); - SUPERLU_FREE (Llu->Lrowind_bc_offset); - SUPERLU_FREE (Llu->Lnzval_bc_ptr); - SUPERLU_FREE (Llu->Lnzval_bc_dat); - SUPERLU_FREE (Llu->Lnzval_bc_offset); - - nb = CEILING(nsupers, grid->nprow); - for (i = 0; i < nb; ++i) - if ( Llu->Ufstnz_br_ptr[i] ) { - SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); - SUPERLU_FREE (Llu->Unzval_br_ptr[i]); - } - SUPERLU_FREE (Llu->Ufstnz_br_ptr); - SUPERLU_FREE (Llu->Unzval_br_ptr); - - /* The following can be freed after factorization. */ - SUPERLU_FREE(Llu->ToRecv); - SUPERLU_FREE(Llu->ToSendD); - SUPERLU_FREE(Llu->ToSendR[0]); - SUPERLU_FREE(Llu->ToSendR); - - /* The following can be freed only after iterative refinement. 
*/ - SUPERLU_FREE(Llu->ilsum); - SUPERLU_FREE(Llu->fmod); - SUPERLU_FREE(Llu->fsendx_plist[0]); - SUPERLU_FREE(Llu->fsendx_plist); - SUPERLU_FREE(Llu->bmod); - SUPERLU_FREE(Llu->bsendx_plist[0]); - SUPERLU_FREE(Llu->bsendx_plist); - SUPERLU_FREE(Llu->mod_bit); - - // nb = CEILING(nsupers, grid->npcol); - // for (i = 0; i < nb; ++i) - // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) { - // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]); - // } - SUPERLU_FREE(Llu->Lindval_loc_bc_ptr); - SUPERLU_FREE(Llu->Lindval_loc_bc_dat); - SUPERLU_FREE(Llu->Lindval_loc_bc_offset); - - nb = CEILING(nsupers, grid->npcol); - for (i=0; iLinv_bc_ptr[i]!=NULL) { - // SUPERLU_FREE(Llu->Linv_bc_ptr[i]); - // } - - if(Llu->Uinv_bc_ptr[i]!=NULL){ - SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); - } - } - SUPERLU_FREE(Llu->Linv_bc_ptr); - SUPERLU_FREE(Llu->Linv_bc_dat); - SUPERLU_FREE(Llu->Linv_bc_offset); - SUPERLU_FREE(Llu->Uinv_bc_ptr); - SUPERLU_FREE(Llu->Unnz); - - nb = CEILING(nsupers, grid->npcol); - for (i = 0; i < nb; ++i) - if ( Llu->Urbs[i] ) { - SUPERLU_FREE(Llu->Ucb_indptr[i]); - SUPERLU_FREE(Llu->Ucb_valptr[i]); - } - SUPERLU_FREE(Llu->Ucb_indptr); - SUPERLU_FREE(Llu->Ucb_valptr); - SUPERLU_FREE(Llu->Urbs); - - SUPERLU_FREE(Glu_persist->xsup); - SUPERLU_FREE(Glu_persist->supno); - -#ifdef GPU_ACC - checkGPU (gpuFree (Llu->d_xsup)); - checkGPU (gpuFree (Llu->d_LRtree_ptr)); - checkGPU (gpuFree (Llu->d_LBtree_ptr)); - checkGPU (gpuFree (Llu->d_ilsum)); - checkGPU (gpuFree (Llu->d_Lrowind_bc_dat)); - checkGPU (gpuFree (Llu->d_Lrowind_bc_offset)); - checkGPU (gpuFree (Llu->d_Lnzval_bc_dat)); - checkGPU (gpuFree (Llu->d_Lnzval_bc_offset)); - checkGPU (gpuFree (Llu->d_Linv_bc_dat)); - checkGPU (gpuFree (Llu->d_Linv_bc_offset)); - checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat)); - checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset)); -#endif - - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit Destroy_LU()"); -#endif -} - -/*! 
\brief Allocate storage in LUstruct */ -void LUstructInit(const int_t n, LUstruct_t *LUstruct) -{ - if ( !(LUstruct->etree = intMalloc_dist(n)) ) - ABORT("Malloc fails for etree[]."); - if ( !(LUstruct->Glu_persist = (Glu_persist_t *) - SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) - ABORT("Malloc fails for Glu_persist_t."); - if ( !(LUstruct->Llu = (LocalLU_t *) - SUPERLU_MALLOC(sizeof(LocalLU_t))) ) - ABORT("Malloc fails for LocalLU_t."); - LUstruct->Llu->inv = 0; -} - -/*! \brief Deallocate LUstruct */ -void LUstructFree(LUstruct_t *LUstruct) -{ -#if ( DEBUGlevel>=1 ) - int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter LUstructFree()"); -#endif - - SUPERLU_FREE(LUstruct->etree); - SUPERLU_FREE(LUstruct->Glu_persist); - SUPERLU_FREE(LUstruct->Llu); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit LUstructFree()"); -#endif -} - -#endif // removed - - /*! \brief * *
@@ -300,57 +92,60 @@ void LUstructFree(LUstruct_t *LUstruct)
  * symmetrically reduced L. 
  * 
*/ -void -countnz_dist(const int_t n, int_t *xprune, - int_t *nnzL, int_t *nnzU, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +void countnz_dist(const int_t n, int_t *xprune, + int_t *nnzL, int_t *nnzU, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { - int_t fnz, fsupc, i, j, nsuper; - int_t jlen, irep; + int_t fnz, fsupc, i, j, nsuper; + int_t jlen, irep; long long int nnzL0; - int_t *supno, *xsup, *xlsub, *xusub, *usub; - - supno = Glu_persist->supno; - xsup = Glu_persist->xsup; - xlsub = Glu_freeable->xlsub; - xusub = Glu_freeable->xusub; - usub = Glu_freeable->usub; - *nnzL = 0; - *nnzU = 0; - nnzL0 = 0; + int_t *supno, *xsup, *xlsub, *xusub, *usub; + + supno = Glu_persist->supno; + xsup = Glu_persist->xsup; + xlsub = Glu_freeable->xlsub; + xusub = Glu_freeable->xusub; + usub = Glu_freeable->usub; + *nnzL = 0; + *nnzU = 0; + nnzL0 = 0; nsuper = supno[n]; - if ( n <= 0 ) return; + if (n <= 0) + return; /* * For each supernode in L. */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jlen = xlsub[fsupc+1] - xlsub[fsupc]; - - for (j = fsupc; j < xsup[i+1]; j++) { - *nnzL += jlen; - *nnzU += j - fsupc + 1; - jlen--; - } - irep = xsup[i+1] - 1; - nnzL0 += xprune[irep] - xlsub[irep]; + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jlen = xlsub[fsupc + 1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i + 1]; j++) + { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i + 1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; } - + /* printf("\tNo of nonzeros in symm-reduced L = %ld\n", nnzL0);*/ - + /* For each column in U. */ - for (j = 0; j < n; ++j) { - for (i = xusub[j]; i < xusub[j+1]; ++i) { - fnz = usub[i]; - fsupc = xsup[supno[fnz]+1]; - *nnzU += fsupc - fnz; - } + for (j = 0; j < n; ++j) + { + for (i = xusub[j]; i < xusub[j + 1]; ++i) + { + fnz = usub[i]; + fsupc = xsup[supno[fnz] + 1]; + *nnzU += fsupc - fnz; + } } } - /*! \brief * *
@@ -360,36 +155,38 @@ countnz_dist(const int_t n, int_t *xprune,
  * 
*/ int64_t -fixupL_dist(const int_t n, const int_t *perm_r, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +fixupL_dist(const int_t n, const int_t *perm_r, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { register int_t nsuper, fsupc, nextl, i, j, k, jstrt; register long long int lsub_size; - int_t *xsup, *lsub, *xlsub; + int_t *xsup, *lsub, *xlsub; - if ( n <= 1 ) return 0; + if (n <= 1) + return 0; - xsup = Glu_persist->xsup; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nextl = 0; + xsup = Glu_persist->xsup; + lsub = Glu_freeable->lsub; + xlsub = Glu_freeable->xlsub; + nextl = 0; nsuper = (Glu_persist->supno)[n]; lsub_size = xlsub[n]; - + /* * For each supernode ... */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jstrt = xlsub[fsupc]; - xlsub[fsupc] = nextl; - for (j = jstrt; j < xlsub[fsupc+1]; j++) { - lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ - nextl++; - } - for (k = fsupc+1; k < xsup[i+1]; k++) - xlsub[k] = nextl; /* Other columns in supernode i */ - + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc + 1]; j++) + { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc + 1; k < xsup[i + 1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ } xlsub[n] = nextl; @@ -400,28 +197,29 @@ fixupL_dist(const int_t n, const int_t *perm_r, */ void set_default_options_dist(superlu_dist_options_t *options) { - options->Fact = DOFACT; - options->Equil = YES; - options->ParSymbFact = NO; + options->Fact = DOFACT; + options->Equil = YES; + options->ParSymbFact = NO; #ifdef HAVE_PARMETIS - options->ColPerm = METIS_AT_PLUS_A; + options->ColPerm = METIS_AT_PLUS_A; #else - options->ColPerm = MMD_AT_PLUS_A; + options->ColPerm = MMD_AT_PLUS_A; #endif - options->RowPerm = LargeDiag_MC64; - options->ReplaceTinyPivot = NO; - options->IterRefine = SLU_DOUBLE; - options->Trans = NOTRANS; 
- options->SolveInitialized = NO; + options->RowPerm = LargeDiag_MC64; + options->ReplaceTinyPivot = NO; + options->IterRefine = SLU_DOUBLE; + options->Trans = NOTRANS; + options->SolveInitialized = NO; options->RefineInitialized = NO; - options->PrintStat = YES; - options->num_lookaheads = 10; - options->lookahead_etree = NO; - options->SymPattern = NO; + options->PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options->Algo3d = NO; #ifdef SLU_HAVE_LAPACK - options->DiagInv = YES; + options->DiagInv = YES; #else - options->DiagInv = NO; + options->DiagInv = NO; #endif } @@ -429,7 +227,8 @@ void set_default_options_dist(superlu_dist_options_t *options) */ void print_options_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. options:\n"); @@ -452,14 +251,15 @@ void print_options_dist(superlu_dist_options_t *options) */ void print_sp_ienv_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. blocking parameters from sp_ienv():\n"); - printf("** relaxation : " IFMT "\n", sp_ienv_dist(2)); - printf("** max supernode : " IFMT "\n", sp_ienv_dist(3)); - printf("** estimated fill ratio : " IFMT "\n", sp_ienv_dist(6)); - printf("** min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7)); + printf("** relaxation : %d\n", sp_ienv_dist(2)); + printf("** max supernode : %d\n", sp_ienv_dist(3)); + printf("** estimated fill ratio : %d\n", sp_ienv_dist(6)); + printf("** min GEMM m*k*n to use GPU : %d\n", sp_ienv_dist(7)); printf("**************************************************\n"); } @@ -471,33 +271,32 @@ void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) SUPERLU_FREE(gstrs_comm); } - /*! 
\brief Diagnostic print of segment info after panel_dfs(). */ -void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, - int_t *segrep, int_t *repfnz) +void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, + int_t *segrep, int_t *repfnz) { int_t j, k; - - for (j = jcol; j < jcol+w; j++) { - printf("\tcol " IFMT ":\n", j); - for (k = 0; k < nseg; k++) - printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, - segrep[k], repfnz[(j-jcol)*n + segrep[k]]); - } + for (j = jcol; j < jcol + w; j++) + { + printf("\tcol " IFMT ":\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, + segrep[k], repfnz[(j - jcol) * n + segrep[k]]); + } } -void -PStatInit(SuperLUStat_t *stat) +void PStatInit(SuperLUStat_t *stat) { register int_t i; - if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) ) - ABORT("Malloc fails for stat->utime[]"); - if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) ) - ABORT("SUPERLU_MALLOC fails for stat->ops[]"); - for (i = 0; i < NPHASES; ++i) { + if (!(stat->utime = SUPERLU_MALLOC(NPHASES * sizeof(double)))) + ABORT("Malloc fails for stat->utime[]"); + if (!(stat->ops = (flops_t *)SUPERLU_MALLOC(NPHASES * sizeof(flops_t)))) + ABORT("SUPERLU_MALLOC fails for stat->ops[]"); + for (i = 0; i < NPHASES; ++i) + { stat->utime[i] = 0.; stat->ops[i] = 0.; } @@ -506,168 +305,179 @@ PStatInit(SuperLUStat_t *stat) stat->gpu_buffer = 0.0; } -void -PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) +void PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) { - double *utime = stat->utime; + double *utime = stat->utime; flops_t *ops = stat->ops; - int_t iam = grid->iam; + int_t iam = grid->iam; flops_t flopcnt, factflop, solveflop; - if ( options->PrintStat == NO ) return; - - if ( !iam && options->Fact != FACTORED ) { - printf("**************************************************\n"); - 
printf("**** Time (seconds) ****\n"); + if (options->PrintStat == NO) + return; + if (!iam && options->Fact != FACTORED) + { + printf("**************************************************\n"); + printf("**** Time (seconds) ****\n"); if ( options->Equil != NO ) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); + printf("\tEQUIL time %8.3f\n", utime[EQUIL]); if ( options->RowPerm != NOROWPERM ) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); + printf("\tROWPERM time %8.3f\n", utime[ROWPERM]); if ( options->ColPerm != NATURAL ) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); - printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); - + printf("\tCOLPERM time %8.3f\n", utime[COLPERM]); + printf("\tSYMBFACT time %8.3f\n", utime[SYMBFAC]); + printf("\tDISTRIBUTE time %8.3f\n", utime[DIST]); } MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + 0, grid->comm); factflop = flopcnt; if ( !iam && options->Fact != FACTORED ) { - printf("\tFACTOR time %8.2f\n", utime[FACT]); + printf("\tFACTOR time %8.3f\n", utime[FACT]); if ( utime[FACT] != 0.0 ) printf("\tFactor flops\t%e\tMflops \t%8.2f\n", flopcnt, flopcnt*1e-6/utime[FACT]); } - - MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + + MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, + 0, grid->comm); solveflop = flopcnt; - if ( !iam ) { - printf("\tSOLVE time %8.3f\n", utime[SOLVE]); - if ( utime[SOLVE] != 0.0 ) - printf("\tSolve flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[SOLVE]); - if ( options->IterRefine != NOREFINE ) { - printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", - utime[REFINE], stat->RefineSteps); - } - printf("**************************************************\n"); + if (!iam) + { + printf("\tSOLVE time %8.3f\n", utime[SOLVE]); + if (utime[SOLVE] != 0.0) + printf("\tSolve flops\t%e\tMflops \t%8.2f\n", + flopcnt, + flopcnt * 1e-6 / utime[SOLVE]); + if (options->IterRefine != NOREFINE) + { 
+ printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", + utime[REFINE], stat->RefineSteps); + } + printf("**************************************************\n"); } - double *utime1,*utime2,*utime3,*utime4; - flops_t *ops1; -#if ( PROFlevel>=1 ) - fflush(stdout); - MPI_Barrier( grid->comm ); + double *utime1, *utime2, *utime3, *utime4; + flops_t *ops1; +#if (PROFlevel >= 1) + fflush(stdout); + MPI_Barrier(grid->comm); { - int_t i, P = grid->nprow*grid->npcol; - flops_t b, maxflop; - - - if ( !iam )utime1=doubleMalloc_dist(P); - if ( !iam )utime2=doubleMalloc_dist(P); - if ( !iam )utime3=doubleMalloc_dist(P); - if ( !iam )utime4=doubleMalloc_dist(P); - if ( !iam )ops1=(flops_t *) SUPERLU_MALLOC(P * sizeof(flops_t)); + int_t i, P = grid->nprow * grid->npcol; + flops_t b, maxflop; + + if (!iam) + utime1 = doubleMalloc_dist(P); + if (!iam) + utime2 = doubleMalloc_dist(P); + if (!iam) + utime3 = doubleMalloc_dist(P); + if (!iam) + utime4 = doubleMalloc_dist(P); + if (!iam) + ops1 = (flops_t *)SUPERLU_MALLOC(P * sizeof(flops_t)); + + // fflush(stdout); + // if ( !iam ) printf("\n.. Tree max sizes:\tbtree\trtree\n"); + // fflush(stdout); + // sleep(2.0); + // MPI_Barrier( grid->comm ); + // for (i = 0; i < P; ++i) { + // if ( iam == i) { + // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); + // fflush(stdout); + // } + // MPI_Barrier( grid->comm ); + // } + + // sleep(2.0); + + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); + + MPI_Gather(&utime[COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[FACT], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); - - // fflush(stdout); - // if ( !iam ) printf("\n.. 
Tree max sizes:\tbtree\trtree\n"); - // fflush(stdout); - // sleep(2.0); - // MPI_Barrier( grid->comm ); - // for (i = 0; i < P; ++i) { - // if ( iam == i) { - // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); - // fflush(stdout); - // } - // MPI_Barrier( grid->comm ); - // } - - // sleep(2.0); + if (!iam) + printf("\n.. FACT ops distribution:\n"); + MPI_Gather(&ops[FACT], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); - - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)\t%e\n", i, ops1[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); - MPI_Gather(&utime[COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[FACT], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT ops distribution:\n"); - MPI_Gather(&ops[FACT], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)\t%e\n", i, ops1[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - if ( !iam ) { - b = factflop/P/maxflop; - printf("\tFACT load balance: %.2f\n", b); - } - fflush(stdout); - MPI_Barrier( grid->comm ); + if (!iam) + { + b = factflop / P / maxflop; + printf("\tFACT load balance: %.2f\n", b); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. 
SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); + + MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE, utime3, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE, utime4, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i, utime1[i], utime2[i], utime3[i], utime4[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. SOLVE ops distribution:\n"); + MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t%d\t%e\n", i, ops1[i]); + } + MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) + { + b = solveflop / P / maxflop; + printf("\tSOLVE load balance: %.2f\n", b); + fflush(stdout); + } + } - - if ( !iam ) printf("\n.. SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); - - MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE,utime3, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE,utime4, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i,utime1[i],utime2[i],utime3[i], utime4[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. 
SOLVE ops distribution:\n"); - MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t%d\t%e\n", i, ops1[i]); - } - MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); - if ( !iam ) { - b = solveflop/P/maxflop; - printf("\tSOLVE load balance: %.2f\n", b); - fflush(stdout); - } - + if (!iam) + { + SUPERLU_FREE(utime1); + SUPERLU_FREE(utime2); + SUPERLU_FREE(utime3); + SUPERLU_FREE(utime4); + SUPERLU_FREE(ops1); } - - if ( !iam ){ - SUPERLU_FREE(utime1); - SUPERLU_FREE(utime2); - SUPERLU_FREE(utime3); - SUPERLU_FREE(utime4); - SUPERLU_FREE(ops1); - } - + #endif -/* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ + /* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ } -void -PStatFree(SuperLUStat_t *stat) +void PStatFree(SuperLUStat_t *stat) { SUPERLU_FREE(stat->utime); SUPERLU_FREE(stat->ops); @@ -678,13 +488,12 @@ PStatFree(SuperLUStat_t *stat) void ifill_dist(int_t *a, int_t alen, int_t ival) { register int_t i; - for (i = 0; i < alen; i++) a[i] = ival; + for (i = 0; i < alen; i++) + a[i] = ival; } - -void -get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) +void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) { int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; int_t *xsup; @@ -692,99 +501,112 @@ get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, i = j = *num_diag_procs = pkk = 0; nprow = grid->nprow; npcol = grid->npcol; - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; xsup = Glu_persist->xsup; - do { - ++(*num_diag_procs); - i = (++i) % nprow; - j = (++j) % npcol; - pkk = PNUM( i, j, grid ); - } while ( pkk != 0 ); /* Until wrap back to process 0 */ - if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) - ABORT("Malloc fails for 
diag_procs[]"); - if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) - ABORT("Calloc fails for diag_len[]"); - for (i = j = k = 0; k < *num_diag_procs; ++k) { - pkk = PNUM( i, j, grid ); - (*diag_procs)[k] = pkk; - i = (++i) % nprow; - j = (++j) % npcol; + do + { + ++(*num_diag_procs); + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; + pkk = PNUM(i, j, grid); + } while (pkk != 0); /* Until wrap back to process 0 */ + if (!(*diag_procs = intMalloc_dist(*num_diag_procs))) + ABORT("Malloc fails for diag_procs[]"); + if (!(*diag_len = intCalloc_dist(*num_diag_procs))) + ABORT("Calloc fails for diag_len[]"); + for (i = j = k = 0; k < *num_diag_procs; ++k) + { + pkk = PNUM(i, j, grid); + (*diag_procs)[k] = pkk; + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; } - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - i = k % *num_diag_procs; - (*diag_len)[i] += knsupc; + for (k = 0; k < nsupers; ++k) + { + knsupc = SuperSize(k); + i = k % *num_diag_procs; + (*diag_len)[i] += knsupc; } } - /*! 
\brief Get the statistics of the supernodes */ #define NBUCKS 10 -static int_t max_sup_size; +static int_t max_sup_size; void super_stats_dist(int_t nsuper, int_t *xsup) { register int_t nsup1 = 0; - int_t i, isize, whichb, bl, bh; - int_t bucket[NBUCKS]; + int_t i, isize, whichb, bl, bh; + int_t bucket[NBUCKS]; max_sup_size = 0; - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - if ( isize == 1 ) nsup1++; - if ( max_sup_size < isize ) max_sup_size = isize; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + if (isize == 1) + nsup1++; + if (max_sup_size < isize) + max_sup_size = isize; } - printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper+1); + printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper + 1); printf("\tmax supernode size = " IFMT "\n", max_sup_size); printf("\tno of size 1 supernodes = " IFMT "\n", nsup1); /* Histogram of the supernode sizes */ - ifill_dist (bucket, NBUCKS, 0); + ifill_dist(bucket, NBUCKS, 0); - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - whichb = (float) isize / max_sup_size * NBUCKS; - if (whichb >= NBUCKS) whichb = NBUCKS - 1; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + whichb = (float)isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) + whichb = NBUCKS - 1; bucket[whichb]++; } - + printf("\tHistogram of supernode sizes:\n"); - for (i = 0; i < NBUCKS; i++) { - bl = (float) i * max_sup_size / NBUCKS; - bh = (float) (i+1) * max_sup_size / NBUCKS; - printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl+1, bh, bucket[i]); + for (i = 0; i < NBUCKS; i++) + { + bl = (float)i * max_sup_size / NBUCKS; + bh = (float)(i + 1) * max_sup_size / NBUCKS; + printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl + 1, bh, bucket[i]); } - } - /*! \brief Check whether repfnz[] == EMPTY after reset. 
*/ void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) { int_t jj, k; - for (jj = jcol; jj < jcol+w; jj++) - for (k = 0; k < n; k++) - if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { - fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", - jj, k, repfnz[(jj-jcol)*n + k]); - ABORT("check_repfnz_dist"); - } + for (jj = jcol; jj < jcol + w; jj++) + for (k = 0; k < n; k++) + if (repfnz[(jj - jcol) * n + k] != EMPTY) + { + fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", + jj, k, repfnz[(jj - jcol) * n + k]); + ABORT("check_repfnz_dist"); + } } void PrintInt10(char *name, int_t len, int_t *x) { register int_t i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[" IFMT "-" IFMT "]", i, i+9); - printf(IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[" IFMT "-" IFMT "]", i, i + 9); + printf(IFMT, x[i]); } printf("\n"); } @@ -792,11 +614,13 @@ void PrintInt10(char *name, int_t len, int_t *x) void PrintInt32(char *name, int len, int *x) { register int i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9); - printf("%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[%2d-%2d]", i, i + 9); + printf("%6d", x[i]); } printf("\n"); } @@ -804,11 +628,13 @@ void PrintInt32(char *name, int len, int *x) int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) { register int_t i; - + fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i+9); - fprintf(fp, IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i + 9); + fprintf(fp, IFMT, x[i]); } fprintf(fp, "\n"); return 0; @@ -817,43 +643,50 @@ int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) int file_PrintInt32(FILE *fp, char *name, int len, int *x) { register int i; - + fprintf(fp, "%10s:", name); - 
for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9); - fprintf(fp, "%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[%2d-%2d]", i, i + 9); + fprintf(fp, "%6d", x[i]); } fprintf(fp, "\n"); return 0; } -int_t -CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) +int_t CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) { register int_t i, j, zd, numzd = 0; - for (j = 0; j < n; ++j) { - zd = 0; - for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) { - /*if ( iperm[rowind[i]] == j ) zd = 1;*/ - if ( rowind[i] == j ) { zd = 1; break; } - } - if ( zd == 0 ) { -#if ( PRNTlevel>=2 ) - printf(".. Diagonal of column %d is zero.\n", j); + for (j = 0; j < n; ++j) + { + zd = 0; + for (i = colbeg[j]; i < colbeg[j] + colcnt[j]; ++i) + { + /*if ( iperm[rowind[i]] == j ) zd = 1;*/ + if (rowind[i] == j) + { + zd = 1; + break; + } + } + if (zd == 0) + { +#if (PRNTlevel >= 2) + printf(".. Diagonal of column %d is zero.\n", j); #endif - ++numzd; - } + ++numzd; + } } return numzd; } - /* --------------------------------------------------------------------------- */ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY1 and ARRAY2 in the increasing @@ -872,34 +705,39 @@ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) * On entry, contains the array to be sorted. * On exit, contains the sorted array. 
*/ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY1[J] > ARRAY1[J + IGAP]) { - TEMP = ARRAY1[J]; - ARRAY1[J] = ARRAY1[J + IGAP]; - ARRAY1[J + IGAP] = TEMP; - TEMP = ARRAY2[J]; - ARRAY2[J] = ARRAY2[J + IGAP]; - ARRAY2[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY1[J] > ARRAY1[J + IGAP]) + { + TEMP = ARRAY1[J]; + ARRAY1[J] = ARRAY1[J + IGAP]; + ARRAY1[J + IGAP] = TEMP; + TEMP = ARRAY2[J]; + ARRAY2[J] = ARRAY2[J + IGAP]; + ARRAY2[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } - void isort1(int_t N, int_t *ARRAY) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY in increasing order. @@ -914,62 +752,72 @@ void isort1(int_t N, int_t *ARRAY) * On exit, contains the sorted array. 
* */ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY[J] > ARRAY[J + IGAP]) { - TEMP = ARRAY[J]; - ARRAY[J] = ARRAY[J + IGAP]; - ARRAY[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY[J] > ARRAY[J + IGAP]) + { + TEMP = ARRAY[J]; + ARRAY[J] = ARRAY[J + IGAP]; + ARRAY[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } /* Only log the memory for the buffer space, excluding the LU factors */ -void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) { - stat->current_buffer += (float) cur_bytes; - if (cur_bytes > 0) { - stat->peak_buffer = - SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); +void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) +{ + stat->current_buffer += (float)cur_bytes; + if (cur_bytes > 0) + { + stat->peak_buffer = + SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); } } -void print_memorylog(SuperLUStat_t *stat, char *msg) { +void print_memorylog(SuperLUStat_t *stat, char *msg) +{ printf("__ %s (MB):\n\tcurrent_buffer : %8.2f\tpeak_buffer : %8.2f\n", - msg, stat->current_buffer, stat->peak_buffer); + msg, stat->current_buffer, stat->peak_buffer); } -int compare_pair (const void *a, const void *b) +int compare_pair(const void *a, const void *b) { - return (((struct superlu_pair *) a)->val - ((struct superlu_pair *) b)->val); + return (((struct superlu_pair *)a)->val - ((struct superlu_pair *)b)->val); } int get_thread_per_process() -{ - char* ttemp; +{ + char *ttemp; ttemp = getenv("THREAD_PER_PROCESS"); - if(ttemp) return atoi(ttemp); - else return 1; + if (ttemp) + return atoi(ttemp); + else + return 1; } -int_t -get_max_buffer_size () +int_t get_max_buffer_size() { char *ttemp; - ttemp = getenv 
("MAX_BUFFER_SIZE"); + ttemp = getenv("MAX_BUFFER_SIZE"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 5000000; } @@ -980,7 +828,7 @@ get_gpublas_nb () char *ttemp; ttemp = getenv ("GPUBLAS_NB"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 64; } @@ -991,13 +839,12 @@ get_num_gpu_streams () char *ttemp; ttemp = getenv ("NUM_GPU_STREAMS"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 8; } -int_t -get_min (int_t * sums, int_t nprocs) +int_t get_min(int_t *sums, int_t nprocs) { int_t min_ind, min_val; min_ind = 0; @@ -1014,9 +861,8 @@ get_min (int_t * sums, int_t nprocs) return min_ind; } -int_t -static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, - int_t ldp, int_t * sums, int_t * counts, int nprocs) +int_t static_partition(struct superlu_pair *work_load, int_t nwl, int_t *partition, + int_t ldp, int_t *sums, int_t *counts, int nprocs) { //initialization loop for (int i = 0; i < nprocs; ++i) @@ -1024,16 +870,15 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, counts[i] = 0; sums[i] = 0; } - qsort (work_load, nwl, sizeof (struct superlu_pair), compare_pair); + qsort(work_load, nwl, sizeof(struct superlu_pair), compare_pair); // for(int i=0;i= 0; i--) { - int_t ind = get_min (sums, nprocs); + int_t ind = get_min(sums, nprocs); // printf("ind %d\n",ind ); partition[ldp * ind + counts[ind]] = work_load[i].ind; counts[ind]++; sums[ind] += work_load[i].val; - } return 0; @@ -1042,19 +887,18 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, /* * Search for the metadata of the j-th block in a U panel. */ -void -arrive_at_ublock (int_t j, /* j-th block in a U panel */ - int_t * iukp, /* output : point to index[] of j-th block */ - int_t * rukp, /* output : point to nzval[] of j-th block */ - int_t * jb, /* Global block number of block U(k,j). */ - int_t * ljb, /* Local block number of U(k,j). 
*/ - int_t * nsupc,/* supernode size of destination block */ - int_t iukp0, /* input : search starting point */ - int_t rukp0, - int_t * usub, /* U subscripts */ - int_t * perm_u, /* permutation vector from static schedule */ - int_t * xsup, /* for SuperSize and LBj */ - gridinfo_t * grid) +void arrive_at_ublock(int_t j, /* j-th block in a U panel */ + int_t *iukp, /* output : point to index[] of j-th block */ + int_t *rukp, /* output : point to nzval[] of j-th block */ + int_t *jb, /* Global block number of block U(k,j). */ + int_t *ljb, /* Local block number of U(k,j). */ + int_t *nsupc, /* supernode size of destination block */ + int_t iukp0, /* input : search starting point */ + int_t rukp0, + int_t *usub, /* U subscripts */ + int_t *perm_u, /* permutation vector from static schedule */ + int_t *xsup, /* for SuperSize and LBj */ + gridinfo_t *grid) { int_t jj; *iukp = iukp0; /* point to the first block in index[] */ @@ -1075,66 +919,70 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */ * usub[] - index array for panel U(k,:) */ // printf("iukp %d \n",*iukp ); - *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ + *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ // printf("jb %d \n",*jb ); - *nsupc = SuperSize (*jb); + *nsupc = SuperSize(*jb); // printf("nsupc %d \n",*nsupc ); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ *rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj); - Move to block U(k,jj+1) in nzval[] */ + Move to block U(k,jj+1) in nzval[] */ *iukp += *nsupc; } /* Set the pointers to the beginning of U block U(k,j) */ - *jb = usub[*iukp]; /* Global block number of block U(k,j). */ - *ljb = LBj (*jb, grid); /* Local block number of U(k,j). */ - *nsupc = SuperSize (*jb); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *jb = usub[*iukp]; /* Global block number of block U(k,j). 
*/ + *ljb = LBj(*jb, grid); /* Local block number of U(k,j). */ + *nsupc = SuperSize(*jb); + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ } - /* * Count the maximum size of U(kk,:) that I own locally. * September 28, 2016. * Modified December 4, 2018. */ -static int_t num_full_cols_U -( - int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, - gridinfo_t *grid, int_t *perm_u, - int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ +int_t num_full_cols_U( + int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u, + int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ ) { - int_t lk = LBi (kk, grid); + int_t lk = LBi(kk, grid); int_t *usub = Ufstnz_br_ptr[lk]; - if (usub == NULL) return 0; /* code */ + if (usub == NULL) + return 0; /* code */ - int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ - int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - - int_t klst = FstBlockC (kk + 1); + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC(kk + 1); int_t iukp0 = iukp; int_t rukp0 = rukp; - int_t jb,ljb; + int_t jb, ljb; int_t nsupc; int_t full = 1; int_t full_Phi = 1; int_t temp_ncols = 0; int_t segsize; - for (int_t j = 0; j < nub; ++j) { - - /* Sherry -- no need to search from beginning ?? */ + *ldu = 0; + + for (int_t j = 0; j < nub; ++j) + { + + /* Sherry -- no need to search from beginning ?? 
*/ arrive_at_ublock( - j, &iukp, &rukp, &jb, &ljb, &nsupc, - iukp0, rukp0, usub, perm_u, xsup, grid - ); - for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid); + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { segsize = klst - usub[jj]; - if ( segsize ) ++temp_ncols; - if ( segsize > *ldu ) *ldu = segsize; + if (segsize) + ++temp_ncols; + if (segsize > *ldu) + *ldu = segsize; } } return temp_ncols; @@ -1142,20 +990,20 @@ static int_t num_full_cols_U int_t estimate_bigu_size( int_t nsupers, - int_t**Ufstnz_br_ptr, /* point to U index[] array */ + int_t **Ufstnz_br_ptr, /* point to U index[] array */ Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u, int_t *max_ncols /* Output: Max. number of columns among all U(k,:). - This is used for allocating GEMM V buffer. */ - ) + This is used for allocating GEMM V buffer. */ +) { int_t iam = grid->iam; int_t Pc = grid->npcol; int_t Pr = grid->nprow; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - - int_t* xsup = Glu_persist->xsup; + int_t myrow = MYROW(iam, grid); + int_t mycol = MYCOL(iam, grid); + + int_t *xsup = Glu_persist->xsup; int_t ncols = 0; /* Count local number of nonzero columns */ int_t ldu = 0; /* Count max. 
segment size in one row U(k,:) */ @@ -1163,12 +1011,14 @@ int_t estimate_bigu_size( int_t max_ldu = 0; /* Initialize perm_u */ - for (int i = 0; i < nsupers; ++i) perm_u[i] = i; + for (int i = 0; i < nsupers; ++i) + perm_u[i] = i; - for (int lk = myrow; lk < nsupers; lk += Pr) {/* Go through my block rows */ + for (int lk = myrow; lk < nsupers; lk += Pr) + { /* Go through my block rows */ ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, - xsup, grid, perm_u, &ldu) ); - my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); + xsup, grid, perm_u, &ldu)); + my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); } #if 0 my_max_ldu = my_max_ldu*8; //YL: 8 is a heuristic number @@ -1179,116 +1029,247 @@ int_t estimate_bigu_size( MPI_Allreduce(&my_max_ldu, &max_ldu, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); MPI_Allreduce(&ncols, max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); -#if ( PRNTlevel>=1 ) - if ( iam==0 ) { - printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", - *max_ncols, max_ldu, max_ldu * (*max_ncols)); - fflush(stdout); +#if (PRNTlevel >= 1) + if (iam == 0) + { + printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", + *max_ncols, max_ldu, max_ldu * (*max_ncols)); + fflush(stdout); } #endif - return(max_ldu * (*max_ncols)); + return (max_ldu * (*max_ncols)); } -void quickSort( int_t* a, int_t l, int_t r, int_t dir) +void quickSort(int_t *a, int_t l, int_t r, int_t dir) { - int_t j; - - if( l < r ) - { - // divide and conquer - j = partition( a, l, r, dir); - quickSort( a, l, j-1, dir); - quickSort( a, j+1, r, dir); - } - + int_t j; + + if (l < r) + { + // divide and conquer + j = partition(a, l, r, dir); + quickSort(a, l, j - 1, dir); + quickSort(a, j + 1, r, dir); + } } -int_t partition( int_t* a, int_t l, int_t r, int_t dir) { - int_t pivot, i, j, t; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - t = a[i]; a[i] = 
a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - }else if(dir==1){ - while( 1) - { - do ++i; while( a[i] >= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - t = a[i]; a[i] = a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - } +int_t partition(int_t *a, int_t l, int_t r, int_t dir) +{ + int_t pivot, i, j, t; + pivot = a[l]; + i = l; + j = r + 1; + + if (dir == 0) + { + while (1) + { + do + ++i; + while (a[i] <= pivot && i <= r); + do + --j; + while (a[j] > pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + return 0; } +void quickSortM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) +{ + int_t j; + + if (l < r) + { + // printf("dims: %5d",dims); + // fflush(stdout); + // divide and conquer + j = partitionM(a, l, r, lda, dir, dims); + quickSortM(a, l, j-1, lda, dir, dims); + quickSortM(a, j+1, r, lda, dir, dims); + } +} -void quickSortM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) +int_t partitionM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t j; - - if( l < r ) - { - // printf("dims: %5d",dims); - // fflush(stdout); - - // divide and conquer - j = partitionM( a, l, r,lda,dir, dims); - quickSortM( a, l, j-1,lda,dir,dims); - quickSortM( a, j+1, r,lda,dir,dims); - } - + int_t pivot, i, j, t, dd; + pivot = a[l]; + i = l; + j = r + 1; + + if (dir == 0) + { + while (1) + { + do + ++i; + while (a[i] <= pivot && i <= r); + do + --j; + while (a[j] > pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) 
+ { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) + { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + + return 0; +} /* partitionM */ + +int_t **getTreePerm(int_t *myTreeIdxs, int_t *myZeroTrIdxs, + int_t *nodeCount, int_t **nodeList, + int_t *perm_c_supno, int_t *iperm_c_supno, + gridinfo3d_t *grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + int_t **treePerm = SUPERLU_MALLOC(sizeof(int_t *) * maxLvl); + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + // treePerm[lvl] = NULL; + int_t treeId = myTreeIdxs[lvl]; + treePerm[lvl] = getPermNodeList(nodeCount[treeId], nodeList[treeId], + perm_c_supno, iperm_c_supno); + } + return treePerm; +} + +int_t *getMyNodeCounts(int_t maxLvl, int_t *myTreeIdxs, int_t *gNodeCount) +{ + int_t *myNodeCount = INT_T_ALLOC(maxLvl); + for (int i = 0; i < maxLvl; ++i) + { + myNodeCount[i] = gNodeCount[myTreeIdxs[i]]; + } + return myNodeCount; } +/*chekc a vector vec of len across different process grids*/ +int_t checkIntVector3d(int_t *vec, int_t len, gridinfo3d_t *grid3d) +{ + int_t nP = grid3d->zscp.Np; + int_t myGrid = grid3d->zscp.Iam; + int_t *buf = intMalloc_dist(len); -int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t pivot, i, j, t, dd; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - for(dd=0;dd= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - for(dd=0;ddzscp.comm, &status); + + for (int_t i = 0; i < len; ++i) + { + /* code */ 
+ if (buf[i] != vec[i]) + { + /* code */ + printf("Error occured at (%d) Loc %d \n", (int)p, (int)i); + exit(0); + } + } + } + } + else + { + MPI_Send(vec, len, mpi_int_t, 0, myGrid, grid3d->zscp.comm); + } + + return 0; } +/** + * reduce the states from all the two grids before prinitng it out + * See the defenition of enum PhaseType in superlu_enum_const.h + */ +int_t reduceStat(PhaseType PHASE, + SuperLUStat_t *stat, gridinfo3d_t *grid3d) +{ + flops_t *ops = stat->ops; + flops_t flopcnt; + MPI_Reduce(&ops[PHASE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid3d->zscp.comm); + + if (!grid3d->zscp.Iam) + { + ops[PHASE] = flopcnt; + } + + return 0; +} + +/*---- end from 3D code p3dcomm.c ----*/ #ifdef GPU_ACC @@ -1525,3 +1506,36 @@ gemm_division_new (int * num_streams_used, /*number of streams that will be us } #endif /* defined GPU_ACC */ + +/* The following are moved from superlu_gpu.cu */ + +int getnGPUStreams() +{ + // Disabling multiple gpu streams + #if 1 + return 1; + #else + char *ttemp; + ttemp = getenv ("NUM_GPU_STREAMS"); + + if (ttemp) + return atoi (ttemp); + else + return 1; + #endif +} + +int get_mpi_process_per_gpu () +{ + char *ttemp; + ttemp = getenv ("MPI_PROCESS_PER_GPU"); + + if (ttemp) + return atol (ttemp); + else + { + printf("MPI_PROCESS_PER_GPU is not set; Using default 1 \n"); + return 1; + } +} + diff --git a/SRC/util_dist.h b/SRC/util_dist.h index 93dc5640..a52b7c1f 100644 --- a/SRC/util_dist.h +++ b/SRC/util_dist.h @@ -19,6 +19,7 @@ at the top-level directory. #include #include #include + #include "superlu_enum_consts.h" /* @@ -56,7 +57,11 @@ at the top-level directory. #define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) #define SUPERLU_MIN(x, y) ( (x) < (y) ? 
(x) : (y) ) - +// allocating macros +#define MPI_REQ_ALLOC(x) ((MPI_Request *) SUPERLU_MALLOC ( (x) * sizeof (MPI_Request))) +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + /* * Constants */ @@ -68,6 +73,13 @@ at the top-level directory. #define TRUE (1) #endif +/*==== For 3D code ====*/ +#define MAX_3D_LEVEL 32 /*allows for z dimensions of 2^32*/ +#define CBLOCK 192 +#define CACHE_LINE_SIZE 8 +#define CSTEPPING 8 +/*=====================*/ + /* * Type definitions */ @@ -150,4 +162,132 @@ typedef struct { #define SuperLU_U_NZ_START(col) ( Ustore->colptr[col] ) #define SuperLU_U_SUB(ptr) ( Ustore->rowind[ptr] ) +/*********************************************************************** + * For 3D code */ +/* SCT_t was initially Schur-complement counter to compute different + metrics of Schur-complement Update. + Later, it includes counters to keep track of many other metrics. +*/ +typedef struct +{ + int_t datatransfer_count; + int_t schurPhiCallCount; + int_t PhiMemCpyCounter; + double acc_load_imbal; + double LookAheadGEMMFlOp; + double PhiWaitTimer_2; + double LookAheadGEMMTimer; + double LookAheadRowSepTimer; + double LookAheadScatterTimer; + double GatherTimer ; + double GatherMOP ; + double scatter_mem_op_counter; + double LookAheadRowSepMOP ; + double scatter_mem_op_timer; + double schur_flop_counter; + double schur_flop_timer; + double CPUOffloadTimer; + double PhiWaitTimer; + double NetSchurUpTimer; + double AssemblyTimer; + double PhiMemCpyTimer; + double datatransfer_timer; + double LookAheadScatterMOP; + double schurPhiCallTimer; + double autotunetime; + double *Predicted_acc_sch_time; + double *Predicted_acc_gemm_time; + double *Predicted_acc_scatter_time; + + double trf2_flops; + double trf2_time; + double offloadable_flops; /*flops that can be done on ACC*/ + double offloadable_mops; /*mops that can be done on ACC*/ + + double 
*SchurCompUdtThreadTime; + double *Predicted_host_sch_time; + double *Measured_host_sch_time; + +#ifdef SCATTER_PROFILE + double *Host_TheadScatterMOP ; + double *Host_TheadScatterTimer; +#endif + +#ifdef OFFLOAD_PROFILE + double *Predicted_acc_scatter_time_strat1; + double *Predicted_host_sch_time_strat1; + size_t pci_transfer_count[18]; /*number of transfers*/ + double pci_transfer_time[18]; /*time for each transfer */ + double pci_transfer_prediction_error[18]; /*error in prediction*/ + double host_sch_time[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ + double host_sch_flop[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ +#endif + + double pdgstrs2_timer; + double pdgstrf2_timer; + double lookaheadupdatetimer; + double pdgstrfTimer; + +// new timers for different wait times + //convention: tl suffix refers to times measured from rdtsc + // td : suffix refers to times measured in SuerpLU_timer + + /* diagonal block factorization; part of pdgstrf2; called from thread*/ + // double Local_Dgstrf2_tl; + double *Local_Dgstrf2_Thread_tl; + /*wait for receiving U diagonal block: part of mpf*/ + double Wait_UDiagBlock_Recv_tl; + /*wait for receiving L diagonal block: part of mpf*/ + double Wait_LDiagBlock_Recv_tl; + + + /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */ + double Recv_UDiagBlock_tl; + /*wait for previous U block send to finish; part of pdgstrf2 */ + double Wait_UDiagBlockSend_tl; + /*after obtaining U block, time spent in calculating L panel*/ + double L_PanelUpdate_tl; + /*Synchronous Broadcasting L and U panel*/ + double Bcast_UPanel_tl; + double Bcast_LPanel_tl; + /*Wait for L send to finish */ + double Wait_LSend_tl; + + /*Wait for U send to finish */ + double Wait_USend_tl; + /*Wait for U receive */ + double Wait_URecv_tl; + /*Wait for L receive */ + double Wait_LRecv_tl; + + /*time to get lock*/ + double *GetAijLock_Thread_tl; + + /*U_panelupdate*/ + double PDGSTRS2_tl; + + /*profiling by phases 
*/ + double Phase_Factor_tl; + double Phase_LU_Update_tl; + double Phase_SC_Update_tl; + + /*3D timers*/ + double ancsReduce; /*timer for reducing ancestors before factorization*/ + double gatherLUtimer; /*timer for gather LU factors into bottom layer*/ + double tFactor3D[MAX_3D_LEVEL]; + double tSchCompUdt3d[MAX_3D_LEVEL]; + + /*ASync Profiler timing*/ + double tAsyncPipeTail; + + /*double t_Startup time before factorization starts*/ + double tStartup; + + /*keeping track of data sent*/ + double commVolFactor; + double commVolRed; + +} SCT_t; + + #endif /* __SUPERLU_UTIL */ diff --git a/SRC/zHWPM_CombBLAS.hpp b/SRC/zHWPM_CombBLAS.hpp index 7860e6a5..f45ebfed 100644 --- a/SRC/zHWPM_CombBLAS.hpp +++ b/SRC/zHWPM_CombBLAS.hpp @@ -73,7 +73,7 @@ zGetHWPM(SuperMatrix *A, gridinfo_t *grid, zScalePermstruct_t *ScalePermstruct) { printf("HWPM only supports square process grid. Retuning without a permutation.\n"); } - combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc; + combblas::SpParMat < int_t, double, combblas::SpDCCols > Adcsc(grid->comm); std::vector< std::vector < std::tuple > > data(procs); /* ------------------------------------------------------------ diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c index e2957379..4c670505 100644 --- a/SRC/zbinary_io.c +++ b/SRC/zbinary_io.c @@ -18,6 +18,7 @@ zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -27,7 +28,7 @@ zwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("cmatrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); @@ -37,4 +38,5 @@ zwrite_binary(int_t n, int_t nnz, printf("dump binary file ... 
# of doubles fwrite: %d\n", nnz_written); assert(nnz_written == 2*nnz); fclose(fp1); + return 0; } diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c new file mode 100644 index 00000000..55f9c435 --- /dev/null +++ b/SRC/zcommunication_aux.c @@ -0,0 +1,502 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Communication routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+#include "superlu_zdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t zIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t zIBcast_UPanel
+/*asynchronously braodcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously braodcasts U panel to my process row */
+int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     doublecomplex* uval, gridinfo_t *grid,
+		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zIrecv_LPanel
+/*it places Irecv call for L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, doublecomplex* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, zLocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t zIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, doublecomplex* Uval_buf, zLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column. */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+               SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ ,
+               scp->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow,
+               SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+               scp->comm, &recv_req_u[1]);
+
+    return 0;
+}
+
+int_t zWait_URecv
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zWait_LRecv
+/*waits till L blocks have been received*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t zISend_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size, /*number of elements to be broadcasted*/
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pr,
+                       SLU_MPI_TAG (4, k0) /* tag */ ,
+                       comm, U_diag_blk_send_req + pr);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                      int_t size, /*number of elements to be broadcasted*/
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    MPI_Recv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+              SLU_MPI_TAG (4, k0), comm, &status);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, zLocalLU_t *Llu)
+/*Copies src matrix into dest matrix*/
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    for (int j = 0; j < nsupc; ++j) {
+	memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(doublecomplex) );
+    }
+#endif
+    
+    return 0;
+}
+
+int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Comm comm = (grid->rscp).comm; /*Row communicator*/
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pc = 0; pc < Pc; ++pc)
+    {
+        if (pc != mycol)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pc,
+                       SLU_MPI_TAG (5, k0) /* tag */ ,
+                       comm, L_diag_blk_send_req + pc);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+               		   SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_UDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+                   SLU_MPI_TAG (5, k0),
+                   comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_lDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/****Ibcast based on mpi ibcast****/
+int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  krow = PROW (k, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+
+    MPI_Ibcast(ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, krow,comm, L_diag_blk_ibcast_req);
+    
+    // MPI_Status status;
+    // MPI_Wait(L_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+int_t zIBcast_LDiagBlock(int_t k, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  kcol = PCOL (k, grid);
+    MPI_Comm comm = (grid->rscp).comm;
+
+    MPI_Ibcast(lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, kcol,comm, U_diag_blk_ibcast_req);
+    // MPI_Status status;
+    // MPI_Wait(U_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+#endif 
+
+int_t zUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /*factor the L panel*/
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /*wait for communication to finish*/
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/zgather.c b/SRC/zgather.c
new file mode 100644
index 00000000..46f80f87
--- /dev/null
+++ b/SRC/zgather.c
@@ -0,0 +1,397 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include 
+#include "superlu_zdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t zprintMatrix(char*s, int n, int m, doublecomplex* A, int LDA)
+{
+    printf("%s\n", s );
+    for(int i=0; ixsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if it myIperm[ib] is within look ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nGPUStreams && myIperm[ib]>0) look_up_flag = 0;        
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within look up window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in look up window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    zgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    zgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // zprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // zprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+/*! \brief Partition the blocks of block row U(k,:) between a CPU ("host")
+ *  set and a GPU ("Phi") set, record per-block metadata in HyP, and gather
+ *  each set into a contiguous buffer carved out of bigU.
+ *
+ *  A block jb is assigned to the CPU set when any of the following hold:
+ *    - iperm_c_supno[jb] precedes HyP->first_l_block_acc;
+ *    - jb falls inside the look-ahead window of k (myIperm test below);
+ *    - jb is k's etree parent and k is the parent's last remaining child.
+ *  Every other block is scheduled on the GPU side (the *_Phi fields).
+ *
+ *  On return, HyP holds the block counts (num_u_blks / num_u_blks_Phi),
+ *  leading dimensions (ldu / ldu_Phi), per-block prefix sums of column
+ *  counts (full_u_cols), and the packed buffers bigU_host / bigU_Phi.
+ */
+void zRgather_U( int_t k, int_t jj0, int_t *usub,	doublecomplex *uval,
+                 doublecomplex *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;      /* NOTE(review): written below but never read in this routine */
+    int_t full_Phi = 1;  /* NOTE(review): same — dead flag here; confirm before removing */
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;  /* redundant; already reset above */
+    HyP->ldu = 0;         /* redundant; already reset above */
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        /* Locate block j of U(k,:): sets iukp/rukp/jb/ljb/nsupc. */
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        /* Count the nonempty column segments of this block. */
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /* here goes the condition whether jb block exists on Phi or not */
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first column on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nGPUStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                u_blk_acc_cond = 1;
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;  /* NOTE(review): duplicate of the assignment two lines up; harmless */
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            /* Schedule block jb on the CPU (host) side. */
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            /* Track the maximum segment height => leading dimension ldu. */
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            /* Schedule block jb on the GPU (Phi) side. */
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now doing prefix sum on ncols */
+    /* NOTE(review): reads/writes Ublock_info[0] even when num_u_blks == 0
+       (stale .ncols); appears benign since the count is 0, but confirm. */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    /* Host buffer starts right after the GPU portion inside bigU. */
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    /* Pack the two block sets into their contiguous buffers. */
+    zgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    zgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* zRgather_U */
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
new file mode 100644
index 00000000..4c97a5b1
--- /dev/null
+++ b/SRC/zlustruct_gpu.h
@@ -0,0 +1,236 @@
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_zdefs.h" + +#ifdef GPU_ACC // enable GPU +#include "gpublas_utils.h" +// #include "mkl.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NGPU_STREAMS 32 + +static +void check(gpuError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n", + file, line, gpuGetErrorString(result), func); + + // Make sure we call GPU Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + doublecomplex *bigV; + doublecomplex *bigU; + doublecomplex *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + doublecomplex *Remain_L_buff; /* on GPU */ + doublecomplex *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} zSCUbuf_gpu_t; + +/* Holds the L & U data structures on the GPU side */ +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + doublecomplex *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + doublecomplex *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + zSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; + doublecomplex *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*gpu events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *ePCIeH2D; + gpuEvent_t *ePCIeD2H_Start; + gpuEvent_t *ePCIeD2H_End; + + int_t 
*xsup_host; + int_t* perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} zLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + zLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU + gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream; + gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS]; + int_t lastOffloadStream[MAX_NGPU_STREAMS]; + int_t nGPUStreams; + int* isNodeInMyGrid; + double acc_async_cost; +} zsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int zinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + zsluGPU_t *sluGPU, + gridinfo_t *grid, + zLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int zreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + zsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + zLUstruct_t *LUstruct); + +extern int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT); +extern int zsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, zsluGPU_t *sluGPU); +extern int zsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, doublecomplex* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, zsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int 
zinitSluGPU3D_t( + zsluGPU_t *sluGPU, + zLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void zCopyLUToGPU3D (int* isNodeInMyGrid, zLocalLU_t *A_host, + zsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT); +extern int zfree_LUstruct_gpu (zLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(zsluGPU_t *sluGPU); + +extern void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex *dA, int_t lda ); + +/*to print out various statistics*/ +void zprintGPUStats(zLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c index 38a08080..edcaf944 100644 --- a/SRC/zmemory_dist.c +++ b/SRC/zmemory_dist.c @@ -169,8 +169,6 @@ doublecomplex *doublecomplexCalloc_dist(int_t n) return (buf); } -#if 0 ///////// Sherry - /*************************************** * The following are from 3D code. 
***************************************/ @@ -285,4 +283,3 @@ void z3D_printMemUse( trf3Dpartition_t* trf3Dpartition, zLUstruct_t *LUstruct, } } -#endif diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c new file mode 100644 index 00000000..6647d933 --- /dev/null +++ b/SRC/znrformat_loc3d.c @@ -0,0 +1,574 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.1.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * October 5, 2021
+ */
+
+#include "superlu_zdefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage:
+ * copy an n-by-m column-major block from Src (leading dimension ldsrc)
+ * into Dst (leading dimension lddst). */
+static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ *
+ *     Fact == DOFACT: allocate A3d/A2d and compute all counts/displacements;
+ *     Fact == SamePattern / SamePattern_SameRowPerm: reuse the counts stored
+ *             in *A3d_addr and only re-gather the (possibly modified) values.
+ */
+void zGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ doublecomplex *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit  /* NOTE(review): shadows function-scope declaration above */
+	int *nnz_disp; // displacement /* NOTE(review): shadows function-scope declaration above */
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	/* Gather per-layer local sizes onto the Z-root (zscp rank 0). */
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	/* Prefix sums; counts are valid on the Z-root only (MPI_Gather above). */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		/* Receive buffers exist only on the Z-root; on other ranks the
+		   A2d members below stay uninitialized but MPI ignores the recv
+		   arguments of MPI_Gatherv on non-root ranks. */
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */
+	    {
+		/* Shift each layer's rowptr entries by that layer's nnz offset. */
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    /* fst_row must be recomputed as the prefix sum of m_loc
+		       over the 2D grid-0 ranks. */
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	/* Persist the gathered matrix and the counts for later reuse/scatter. */
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+		    
+	if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */
+	    A2d->rowptr[0] = 0;
+	    for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+		    {
+			// A2d->rowptr[j] += row_disp[i];
+			A2d->rowptr[j] += nnz_disp[i];
+		    }
+	    }
+	    A2d->nnz_loc = nnz_disp[grid3d->npdep];
+	    A2d->m_loc = row_disp[grid3d->npdep];
+
+	    if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+	    } else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+	    }
+	} /* end 2D layer grid-0 */
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (doublecomplex *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;; /* NOTE(review): stray ';;' — harmless empty statement */
+	
+	/* Btmp <- compact(B), compacting B */
+	doublecomplex *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	doublecomplex *B1; /* allocated on the Z-root only; uninitialized
+			      elsewhere, where MPI ignores the recv buffer */
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, B1,
+		    b_counts_int, b_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* reshape layer i's block-row from row-block layout
+			   into the global column-major B2d */
+			matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* zGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ *
+ * The first call (A3d->num_procs_to_send == EMPTY) builds a send/recv
+ * schedule that is persisted in A3d and reused by later solves.
+ * Returns 0.
+ */
+int zScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    doublecomplex *B = (doublecomplex *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    doublecomplex *B2d = (doublecomplex *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    doublecomplex *B1;  // on 2D layer 0; uninitialized on other ranks,
+                        // where it is never dereferenced (root-only send buf)
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d): undo the column-major packing per layer
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* layer i's rows back into contiguous block layout */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    doublecomplex *Btmp; // on 3D grid
+    Btmp = doublecomplexMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX,
+		     Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid (default) */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+         In the most general case, block rows of B are not of even size, then the
+	 Layer 0 partition may overlap with 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
+	*/
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	/* NOTE(review): recv_reqs is sized npdep; confirm num_procs_to_recv
+	   cannot exceed npdep under skewed row partitions. */
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;   /* NOTE(review): shadows the declaration near the top */
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;   /* NOTE(review): shadows the declaration near the top */
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 set up sends info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking number procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, SuperLU_MPI_DOUBLE_COMPLEX,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+#endif
+	///////////
+	
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp): place received rows back into the caller's B(ldb)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);  /* A3d->B2d is reallocated by the next gather */
+    }
+
+    return 0;
+} /* zScatter_B3d */
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
index 526641aa..993de064 100644
--- a/SRC/zreadMM.c
+++ b/SRC/zreadMM.c
@@ -60,7 +60,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
@@ -107,7 +107,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 3/ Read n and nnz */
 #ifdef _LONGINT
-    sscanf(line, "%ld%ld%ld",m, n, nonz);
+    sscanf(line, "%lld%lld%lld", m, n, nonz);
 #else
     sscanf(line, "%d%d%d",m, n, nonz);
 #endif
diff --git a/SRC/zreadtriple.c b/SRC/zreadtriple.c
index a52eae5f..d8e4c9a2 100644
--- a/SRC/zreadtriple.c
+++ b/SRC/zreadtriple.c
@@ -46,7 +46,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
      */
 
 #ifdef _LONGINT
-    fscanf(fp, "%ld%ld%ld", m, n, nonz);
+    fscanf(fp, "%lld%lld%lld", m, n, nonz);
 #else
     fscanf(fp, "%d%d%d", m, n, nonz);
 #endif
@@ -76,7 +76,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 
 #ifdef _LONGINT
-        fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+        fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #else // int 
         fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #endif
@@ -85,8 +85,9 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	    if ( row[0] == 0 || col[0] == 0 ) {
 		zero_base = 1;
 		printf("triplet file: row/col indices are zero-based.\n");
-	    } else
+	    } else {
 		printf("triplet file: row/col indices are one-based.\n");
+     	    }
 
 	if ( !zero_base ) {
 	    /* Change to 0-based indexing. */
diff --git a/SRC/zreadtriple_noheader.c b/SRC/zreadtriple_noheader.c
index 3ddedb01..8410540b 100644
--- a/SRC/zreadtriple_noheader.c
+++ b/SRC/zreadtriple_noheader.c
@@ -48,7 +48,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     nz = *n = 0;
 
 #ifdef _LONGINT
-    ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i);
+    ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #else  // int
     ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #endif
@@ -61,7 +61,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 	++nz;
 
 #ifdef _LONGINT
-        ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i);
+        ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #else  // int
         ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i);
 #endif
@@ -104,7 +104,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
     /* Read into the triplet array from a file */
     for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
 #ifdef _LONGINT
-	fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
+	fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #else // int32
 	fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i);
 #endif
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
index afcfc2d3..ffe4d25a 100644
--- a/SRC/zscatter.c
+++ b/SRC/zscatter.c
@@ -306,7 +306,7 @@ gemm_division_cpu_gpu(
 )
 {
     int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
@@ -433,7 +433,7 @@ gemm_division_new (int * num_streams_used,   /*number of streams that will be us
     )
 {
     int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
-    int min_gpu_col = get_cublas_nb ();
+    int min_gpu_col = get_gpublas_nb ();
 
     // Ngem = 1000000000;
     /*
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
new file mode 100644
index 00000000..9070fdca
--- /dev/null
+++ b/SRC/zscatter3d.c
@@ -0,0 +1,624 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, doublecomplex *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                   */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+zblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    doublecomplex *L_mat, int ldl,
+                    doublecomplex *U_mat, int ldu,
+                    doublecomplex *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        zscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter */
+
+#ifdef _OPENMP
+/*this version uses a lock to prevent multiple thread updating the same block*/
+void
+zblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         doublecomplex *L_mat, int_t ldl,
+                         doublecomplex *U_mat, int_t ldu,
+                         doublecomplex *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        zscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// there are following three variations of block_gemm_scatter call
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
+
+int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 doublecomplex* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 zLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopLeft */
+
+int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  zLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopRight */
+
+int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    zLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    //printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomLeft */
+
+int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     zLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           doublecomplex *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    doublecomplex *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k.j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                z_sub(&ucol[indirect[i]], &ucol[indirect[i]], &tempv[i]);
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+}
+
+
diff --git a/SRC/zsp_blas2_dist.c b/SRC/zsp_blas2_dist.c
index 4f8990eb..54a55bd3 100644
--- a/SRC/zsp_blas2_dist.c
+++ b/SRC/zsp_blas2_dist.c
@@ -426,9 +426,8 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A,
     }
 
     /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || 
-	z_eq(&alpha, &comp_zero) && 
-	z_eq(&beta, &comp_one))
+    if ( A->nrow == 0 || A->ncol == 0 || 
+	(z_eq(&alpha, &comp_zero) && z_eq(&beta, &comp_one)) )
 	return 0;
 
 
diff --git a/SRC/zsuperlu_blas.c b/SRC/zsuperlu_blas.c
new file mode 100644
index 00000000..2d413761
--- /dev/null
+++ b/SRC/zsuperlu_blas.c
@@ -0,0 +1,122 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_zdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_zgemm(const char *transa, const char *transb,
+                  int m, int n, int k, doublecomplex alpha, doublecomplex *a,
+                  int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return CGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    zgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return zgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+int superlu_ztrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, doublecomplex *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    ztrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return ztrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_zger(const int m, const int n, const doublecomplex alpha,
+                 const doublecomplex *x, const int incx, const doublecomplex *y,
+                 const int incy, doublecomplex *a, const int lda)
+{
+#ifdef _CRAY
+    CGERU(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    zgeru_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx)
+{
+    zscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_zaxpy(const int n, const doublecomplex alpha,
+    const doublecomplex *x, const int incx, doublecomplex *y, const int incy)
+{
+    zaxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_zgemv(const char *trans, const int m,
+                  const int n, const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, const doublecomplex *x, const int incx,
+                  const doublecomplex beta, doublecomplex *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_ztrsv(char *uplo, char *trans, char *diag,
+                  int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    CTRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
new file mode 100644
index 00000000..4cddd772
--- /dev/null
+++ b/SRC/zsuperlu_gpu.cu
@@ -0,0 +1,1791 @@
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
+ * 
+ */ + +//#define GPU_DEBUG + +#include "mpi.h" +// #include "sec_structs.h" +#include +#include +#include + +#undef Reduce + +//#include + +#include "zlustruct_gpu.h" + +#include "dcomplex.h" + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// gpublasStatus_t checkGPUblas(gpublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != GPUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// assert(result == GPUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + doublecomplex *nzval, int_t ldv, + doublecomplex *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ + +__device__ inline +void zdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + doublecomplex *nzval, int ldv, + const doublecomplex *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + z_sub(&nzval[i], &nzval[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + doublecomplex * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + doublecomplex * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + z_sub(&ucol[i], &ucol[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + zLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + doublecomplex *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + doublecomplex *LnzvalVec = A_gpu->LnzvalVec; + doublecomplex *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + + extern __shared__ int s[]; + int* indirect_lptr = s; /* row-wise */ + int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ + int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ + int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ + //int 
THREAD_BLOCK_SIZE =ldt; + + int* pfxStorage = (int*) &IndirectJ3[ldt]; + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + + doublecomplex *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 + : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, blockDim.x); + // printf("blockDim.x =%d \n", blockDim.x); + + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + doublecomplex *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + doublecomplex *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_lptr[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = blockDim.x / temp_nbrow; + + // if (thread_id < blockDim.x) + // IndirectJ1[thread_id] = 0; + if (thread_id < ldt) + IndirectJ1[thread_id] = 0; + + if 
(thread_id < blockDim.x) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + __syncthreads(); + + incScan(IndirectJ1, pfxStorage, nsupc); + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_lptr, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + doublecomplex *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, blockDim.x); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_lptr[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_lptr[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = blockDim.x / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + zdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + 
usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + int SCATTER_THREAD_BLOCK_SIZE=512; + + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + gpuStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + gpublasHandle_t gpublas_handle0 = sluGPU->gpublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + doublecomplex * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + doublecomplex * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... 
calculate usub_indirect */ + + //printf("zSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + gpuEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(doublecomplex), + gpuMemcpyHostToDevice, FunCallStream)) ; + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(doublecomplex), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + gpuMemcpyHostToDevice, FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + checkGPU(gpuMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), gpuMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(doublecomplex) + + bigu_send_size * sizeof(doublecomplex) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; + + /* The following are used in gpublasZgemm() call */ + cuDoubleComplex *cu_alpha = (cuDoubleComplex*) α + cuDoubleComplex *cu_beta = (cuDoubleComplex*) β + cuDoubleComplex *cu_A, *cu_B, *cu_C; /* C <- A*B */ + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + 
// int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 
0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. */ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + gpublasSetStream(gpublas_handle0, FunCallStream); + gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); + cu_A = (cuDoubleComplex*) &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row]; + cu_B = (cuDoubleComplex*) &A_gpu->scubufs[streamId].bigU[st_col * ldu]; + cu_C = (cuDoubleComplex*) A_gpu->scubufs[streamId].bigV; + gpublasZgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, + nrows, ncols, ldu, cu_alpha, + cu_A, Rnbrow, cu_B, ldu, cu_beta, + cu_C, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + gpuEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 8.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + dim3 dimBlock(ldt); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + gpuStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + 
/* --- tail of zSchurCompUpdate_GPU (function begins earlier in the file) --- */
			gpuEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);

			A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols;
		    } /* endif ... none of the matrix dimension is zero. */

		} /* end while jj_end < nub */

	} /* end while (ii_end < RemainBlk) */

	return 0;
} /* end zSchurCompUpdate_GPU */


/* Query and print the occupancy (minimum grid size / block size) that the
 * runtime's launch configurator suggests for Scatter_GPU_kernel. */
static void print_occupancy()
{
    int blockSize;   // The launch configurator returned block size
    int minGridSize; /* The minimum grid size needed to achieve the
                        best potential occupancy */

    gpuOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize,
                                       Scatter_GPU_kernel, 0, 0);
    printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize);
}

/* Print a few identifying properties of the given GPU device, plus the
 * amount of currently free device memory (queried via gpuMemGetInfo). */
static void printDevProp(gpuDeviceProp devProp)
{
	size_t mfree, mtotal;
	gpuMemGetInfo (&mfree, &mtotal);

	printf("pciBusID:                      %d\n", devProp.pciBusID);
	printf("pciDeviceID:                   %d\n", devProp.pciDeviceID);
	printf("GPU Name:                      %s\n", devProp.name);
	printf("Total global memory:           %zu\n", devProp.totalGlobalMem);
	printf("Total free memory:             %zu\n", mfree);
	printf("Clock rate:                    %d\n", devProp.clockRate);

	return;
}


/* Return the device-memory budget for this process: 90% of the currently
 * free device memory, divided by the number of MPI ranks sharing the GPU. */
static size_t get_acc_memory ()
{

	size_t mfree, mtotal;
	gpuMemGetInfo (&mfree, &mtotal);
#if 0
	printf("Total memory %zu & free memory %zu\n", mtotal, mfree);
#endif
	return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu ();


}

/* Release everything allocated by zCopyLUToGPU3D: device-side L/U index and
 * value vectors, Schur-complement work buffers, pinned host staging buffers,
 * and the per-supernode event arrays.  Only stream 0's pinned buffers are
 * freed here (they are only allocated for stream 0 host-side).
 * Returns 0. */
int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
{
	/* Free the L data structure on GPU */
	checkGPU(gpuFree(A_gpu->LrowindVec));
	checkGPU(gpuFree(A_gpu->LrowindPtr));

	checkGPU(gpuFree(A_gpu->LnzvalVec));
	checkGPU(gpuFree(A_gpu->LnzvalPtr));
	free(A_gpu->LnzvalPtr_host);

	/*freeing the pinned memory*/
	int_t streamId = 0;
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Ublock_info_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host));
	checkGPU (gpuFreeHost (A_gpu->scubufs[streamId].bigU_host));

	checkGPU(gpuFreeHost(A_gpu->acc_L_buff));
	checkGPU(gpuFreeHost(A_gpu->acc_U_buff));
	checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].lsub_buf));
	checkGPU(gpuFreeHost(A_gpu->scubufs[streamId].usub_buf));


	/* NOTE(review): the event handles inside these arrays are not destroyed
	   with gpuEventDestroy before the arrays are freed — confirm whether the
	   events are reclaimed elsewhere. */
	SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE
	SUPERLU_FREE(A_gpu->GemmStart);
	SUPERLU_FREE(A_gpu->GemmEnd);
	SUPERLU_FREE(A_gpu->ScatterEnd);
	SUPERLU_FREE(A_gpu->ePCIeH2D);
	SUPERLU_FREE(A_gpu->ePCIeD2H_Start);
	SUPERLU_FREE(A_gpu->ePCIeD2H_End);

	/* Free the U data structure on GPU */
	checkGPU(gpuFree(A_gpu->UrowindVec));
	checkGPU(gpuFree(A_gpu->UrowindPtr));

	//free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated

	checkGPU(gpuFree(A_gpu->UnzvalVec));
	checkGPU(gpuFree(A_gpu->UnzvalPtr));

	checkGPU(gpuFree(A_gpu->grid));

	/* Free the Schur complement structure on GPU */
	checkGPU(gpuFree(A_gpu->scubufs[streamId].bigV));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].bigU));

	checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_L_buff));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].Ublock_info));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].Remain_info));

	// checkGPU(gpuFree(A_gpu->indirect));
	// checkGPU(gpuFree(A_gpu->indirect2));
	checkGPU(gpuFree(A_gpu->xsup));

	checkGPU(gpuFree(A_gpu->scubufs[streamId].lsub));
	checkGPU(gpuFree(A_gpu->scubufs[streamId].usub));

	/* lookup tables built in zCopyLUToGPU3D */
	checkGPU(gpuFree(A_gpu->local_l_blk_infoVec));
	checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr));
	checkGPU(gpuFree(A_gpu->jib_lookupVec));
	checkGPU(gpuFree(A_gpu->jib_lookupPtr));
	checkGPU(gpuFree(A_gpu->local_u_blk_infoVec));
	checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr));
	checkGPU(gpuFree(A_gpu->ijb_lookupVec));
	checkGPU(gpuFree(A_gpu->ijb_lookupPtr));

	return 0;
}



/* Debug helper: copy an m-by-n device matrix dA (leading dimension lda)
 * back to the host and print it entry by entry. */
void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex * dA, int_t lda )
{
	doublecomplex *cPtr = (doublecomplex *) malloc(sizeof(doublecomplex) * lda * n);
	checkGPU(gpuMemcpy( cPtr, dA,
	                    lda * n * sizeof(doublecomplex),
	                    gpuMemcpyDeviceToHost)) ;

	int_t i, j;
	printf( "\n %s\n", desc );
	for ( i = 0; i < m; i++ )
	{
		for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] );
		printf( "\n" );
	}
	free(cPtr);
}

/* Summarize per-supernode GPU timing: GEMM time, scatter time, and PCIe
 * transfer times/bandwidths, derived from the event pairs recorded during
 * factorization.  Times come from gpuEventElapsedTime (milliseconds),
 * converted to seconds before accumulation. */
void zprintGPUStats(zLUstruct_gpu_t * A_gpu)
{
	double tGemm = 0;
	double tScatter = 0;
	double tPCIeH2D = 0;
	double tPCIeD2H = 0;

	for (int_t i = 0; i < A_gpu->nsupers; ++i)
	{
		float milliseconds = 0;

		if (A_gpu->isOffloaded[i])
		{
			/* H2D window: from the H2D copy event to the start of GEMM */
			gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]);
			tPCIeH2D += 1e-3 * (double) milliseconds;
			milliseconds = 0;
			gpuEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]);
			tGemm += 1e-3 * (double) milliseconds;
			milliseconds = 0;
			gpuEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]);
			tScatter += 1e-3 * (double) milliseconds;
		}

		milliseconds = 0;
		gpuEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]);
		tPCIeD2H += 1e-3 * (double) milliseconds;
	}

	printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n",
	       A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm );
	printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n",
	       A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter );
	printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
	       1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D );
	printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
	       1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H );
	fflush(stdout);

} /* end printGPUStats */

/* Initialize the GPU side of the data structure.
 */
int zinitSluGPU3D_t(
    zsluGPU_t *sluGPU, // LU structures on GPU, see zlustruct_gpu.h
    zLUstruct_t *LUstruct,
    gridinfo3d_t * grid3d,
    int_t* perm_c_supno,
    int_t n,
    int_t buffer_size,    /* read from env variable MAX_BUFFER_SIZE */
    int_t bigu_size,
    int_t ldt             /* NSUP read from sp_ienv(3) */
)
{
    /* Reset the device, create the streams/handles, then hand off to
       zCopyLUToGPU3D to allocate and populate the device LU structure. */
    checkGPUErrors(gpuDeviceReset ()) ;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    zLocalLU_t *Llu = LUstruct->Llu;
    int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;

    sluGPU->nGPUStreams = getnGPUStreams();

    /* Scatter block size defaults to ldt; the env var may only raise it. */
    int SCATTER_THREAD_BLOCK_SIZE = ldt;
    if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
    {
	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
	if(stbs>=ldt)
	{
	    SCATTER_THREAD_BLOCK_SIZE = stbs;
	}

    }

    if (grid3d->iam == 0)
    {
	printf("dinitSluGPU3D_t: Using hardware acceleration, with %d gpu streams \n", sluGPU->nGPUStreams);
	fflush(stdout);
	printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE);

	if ( MAX_SUPER_SIZE < ldt )
	{
	    ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
	}
    }

    gpuStreamCreate(&(sluGPU->CopyStream));

    for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++)
    {
	gpuStreamCreate(&(sluGPU->funCallStreams[streamId]));
	gpublasCreate(&(sluGPU->gpublasHandles[streamId]));
	sluGPU->lastOffloadStream[streamId] = -1; /* no offload issued yet */
    }

    sluGPU->A_gpu = (zLUstruct_gpu_t *) malloc (sizeof(zLUstruct_gpu_t));
    sluGPU->A_gpu->perm_c_supno = perm_c_supno;

    /* Allocate GPU memory for the LU data structures, and copy
       the host LU structure to GPU side.
     */
    zCopyLUToGPU3D ( isNodeInMyGrid,
	             Llu,             /* referred to as A_host */
	             sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt
    );

    return 0;
} /* end zinitSluGPU3D_t */


/* Prepare the device-to-host reduction descriptor d2Hred for supernode
 * next_k: decide whether this rank's L and/or U panel of next_k is dirty on
 * the GPU (and must be copied back), record the local block numbers, index
 * vectors and copy lengths.  Waits for any still-running offload stream
 * that last touched the dirty panel.  Returns 0. */
int zinitD2Hreduce(
    int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP,
    zsluGPU_t *sluGPU, gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t* SCT
)
{
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    zLocalLU_t *Llu = LUstruct->Llu;
    int_t* xsup = Glu_persist->xsup;
    int_t iam = grid->iam;
    int_t myrow = MYROW (iam, grid);
    int_t mycol = MYCOL (iam, grid);
    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;


    // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1);
    // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/
    int_t mkcol, mkrow;

    int_t kljb = LBj( next_k, grid ); /*local block number for next block*/
    int_t kijb = LBi( next_k, grid ); /*local block number for next block*/

    int_t *kindexL ; /*for storing index vectors*/
    int_t *kindexU ;
    mkrow = PROW (next_k, grid);
    mkcol = PCOL (next_k, grid);
    int_t ksup_size = SuperSize(next_k);

    int_t copyL_kljb = 0;
    int_t copyU_kljb = 0;
    int_t l_copy_len = 0;
    int_t u_copy_len = 0;

    /* L panel: copy back only if I own the column, it is nonempty, and the
       GPU copy was updated (dirty bit holds the k0 of the last offload). */
    if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag)
    {
	if (HyP->Lblock_dirty_bit[kljb] > -1)
	{
	    copyL_kljb = 1;
	    int_t lastk0 = HyP->Lblock_dirty_bit[kljb];
	    int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams;
	    if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
	    {
		// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
		double ttx = SuperLU_timer_();
		gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
		SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
		sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
	    }
	}

	kindexL = Lrowind_bc_ptr[kljb];
	l_copy_len = kindexL[1] * ksup_size;
    }

    /* U panel: same dirty-bit / stream-wait protocol as for L. */
    if ( mkrow == myrow &&
         Ufstnz_br_ptr[kijb] != NULL && last_flag )
    {
	if (HyP->Ublock_dirty_bit[kijb] > -1)
	{
	    copyU_kljb = 1;
	    int_t lastk0 = HyP->Ublock_dirty_bit[kijb];
	    int_t streamIdk0Offload = lastk0 % sluGPU->nGPUStreams;
	    if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
	    {
		// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
		double ttx = SuperLU_timer_();
		gpuStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
		SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
		sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
	    }
	}
	// copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0;
	kindexU = Ufstnz_br_ptr[kijb];
	u_copy_len = kindexU[1];
    }

    // wait for streams if they have not been finished

    /* Fill in the reduction descriptor for the caller. */
    // d2Hred->next_col = next_col;
    d2Hred->next_k = next_k;
    d2Hred->kljb = kljb;
    d2Hred->kijb = kijb;
    d2Hred->copyL_kljb = copyL_kljb;
    d2Hred->copyU_kljb = copyU_kljb;
    d2Hred->l_copy_len = l_copy_len;
    d2Hred->u_copy_len = u_copy_len;
    d2Hred->kindexU = kindexU;
    d2Hred->kindexL = kindexL;
    d2Hred->mkrow = mkrow;
    d2Hred->mkcol = mkcol;
    d2Hred->ksup_size = ksup_size;
    return 0;
} /* zinitD2Hreduce */

/* Accumulate (axpy with alpha = 1) the L/U panel data brought back from the
 * GPU (A_gpu->acc_L_buff / acc_U_buff, filled by zsendLUpanelGPU2HOST on
 * CopyStream) into the host-side Lnzval/Unzval arrays.  Synchronizes
 * CopyStream first so the D2H copies are complete.  Returns 0. */
int zreduceGPUlu(
    int last_flag,
    d2Hreduce_t* d2Hred,
    zsluGPU_t *sluGPU,
    SCT_t *SCT,
    gridinfo_t *grid,
    zLUstruct_t *LUstruct
)
{
    zLocalLU_t *Llu = LUstruct->Llu;
    int iam = grid->iam;
    int_t myrow = MYROW (iam, grid);
    int_t mycol = MYCOL (iam, grid);
    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;

    gpuStream_t CopyStream;
    zLUstruct_gpu_t *A_gpu;
    A_gpu = sluGPU->A_gpu;
    CopyStream = sluGPU->CopyStream;

    int_t kljb = d2Hred->kljb;
    int_t kijb = d2Hred->kijb;
    int_t copyL_kljb = d2Hred->copyL_kljb;
    int_t copyU_kljb = d2Hred->copyU_kljb;
    int_t mkrow = d2Hred->mkrow;
    int_t mkcol = d2Hred->mkcol;
    int_t ksup_size = d2Hred->ksup_size;
    int_t *kindex;
    /* Wait for the asynchronous D2H copies issued on CopyStream. */
    if ((copyL_kljb || copyU_kljb) && last_flag )
    {
	double ttx = SuperLU_timer_();
	gpuStreamSynchronize(CopyStream);
	SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx;
    }

    double tt_start = SuperLU_timer_();

    if (last_flag) {
	if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL )
	{
	    kindex = Lrowind_bc_ptr[kljb];
	    int_t len = kindex[1];

	    if (copyL_kljb)
	    {
		doublecomplex *nzval_host;
		nzval_host = Lnzval_bc_ptr[kljb];
		int_t llen = ksup_size * len;
		doublecomplex alpha = {1.0, 0.0};
		superlu_zaxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1);
	    }

	}
    }
    if (last_flag) {
	if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL )
	{
	    kindex = Ufstnz_br_ptr[kijb];
	    int_t len = kindex[1];

	    if (copyU_kljb)
	    {
		doublecomplex *nzval_host;
		nzval_host = Unzval_br_ptr[kijb];

		doublecomplex alpha = {1.0, 0.0};
		superlu_zaxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1);
	    }
	}
    }

    double tt_end = SuperLU_timer_();
    SCT->AssemblyTimer += tt_end - tt_start;
    return 0;
} /* zreduceGPUlu */


/* Block until the given offload stream has drained; the wait is charged to
 * SCT->PhiWaitTimer.  Returns 0. */
int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT)
{
    double ttx = SuperLU_timer_();
    gpuStreamSynchronize(sluGPU->funCallStreams[streamId]);
    SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
    return 0;
}

/* Issue asynchronous device-to-host copies (on CopyStream) of the L and/or
 * U panel values selected by d2Hred into the pinned acc_L_buff/acc_U_buff,
 * bracketed by the ePCIeD2H_Start/End events for supernode k0.
 * The copies are NOT synchronized here; zreduceGPUlu waits for them. */
int zsendLUpanelGPU2HOST(
    int_t k0,
    d2Hreduce_t* d2Hred,
    zsluGPU_t *sluGPU
)
{
    int_t kljb = d2Hred->kljb;
    int_t kijb = d2Hred->kijb;
    int_t copyL_kljb = d2Hred->copyL_kljb;
    int_t copyU_kljb = d2Hred->copyU_kljb;
    int_t l_copy_len = d2Hred->l_copy_len;
    int_t u_copy_len = d2Hred->u_copy_len;
    gpuStream_t CopyStream = sluGPU->CopyStream;;
    zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu;
    double tty = SuperLU_timer_();
    gpuEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream);
    if (copyL_kljb)
	checkGPU(gpuMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]],
	                        l_copy_len * sizeof(doublecomplex), gpuMemcpyDeviceToHost, CopyStream ) );

    if (copyU_kljb)
	checkGPU(gpuMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]],
	                        u_copy_len * sizeof(doublecomplex), gpuMemcpyDeviceToHost, CopyStream ) );
    gpuEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream);
    /* tHost_PCIeD2H measures only the (async) issue time on the CPU side;
       the actual transfer time comes from the event pair above. */
    A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty;
    A_gpu->cPCIeD2H += u_copy_len * sizeof(doublecomplex) + l_copy_len * sizeof(doublecomplex);

    return 0;
}

/* Copy L and U panel data structures from host to the host part of the
   data structures in A_gpu.
   GPU is not involved in this routine. */
int zsendSCUdataHost2GPU(
    int_t streamId,
    int_t* lsub,
    int_t* usub,
    doublecomplex* bigU,
    int_t bigu_send_size,
    int_t Remain_lbuf_send_size,
    zsluGPU_t *sluGPU,
    HyP_t* HyP
)
{
    //{printf("....[enter] zsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}

    /* lengths are decoded from the panel index headers */
    int_t usub_len = usub[2];
    int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
    //{printf("....[2] in zsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);}
    zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu;
    /* Stage everything into the pinned buffers of this stream; the actual
       H2D transfers are issued later (in zSchurCompUpdate_GPU). */
    memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len);
    memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len);
    memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info,
	   sizeof(Remain_info_t)*HyP->RemainBlk);
    memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi,
	   sizeof(Ublock_info_t)*HyP->num_u_blks_Phi);
    memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff,
	   sizeof(doublecomplex)*Remain_lbuf_send_size);
    memcpy(A_gpu->scubufs[streamId].bigU_host, bigU,
	   sizeof(doublecomplex)*bigu_send_size);

    return 0;
}

/* Sherry: not used ?*/
#if 0
int freeSluGPU(zsluGPU_t *sluGPU)
{
    return 0;
}
#endif

/* Allocate GPU memory for the LU data structures, and copy
   the host LU structure to GPU side.
   After factorization, the GPU LU structure should be freed by
   calling zfree_LUsstruct_gpu.
*/ +void zCopyLUToGPU3D ( + int* isNodeInMyGrid, + zLocalLU_t *A_host, /* distributed LU structure on host */ + zsluGPU_t *sluGPU, /* hold LU structure on GPU */ + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + gpuDeviceProp devProp; + gpuGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nGPUStreams = sluGPU->nGPUStreams; + /*pinned memory allocations. 
+ Paged-locked memory by gpuMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nGPUStreams; streamId++ ) { + void *tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkGPUErrors(gpuMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU_host = (doublecomplex *) tmp_ptr; + + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkGPUErrors(gpuMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (doublecomplex *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU = (doublecomplex *) tmp_ptr; + gpu_mem_used += bigu_size * 
sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, buffer_size * sizeof(doublecomplex))) ; + A_gpu->scubufs[streamId].bigV = (doublecomplex *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(doublecomplex); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkGPUErrors(gpuMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->GemmEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ScatterEnd = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeH2D = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (gpuEvent_t *) SUPERLU_MALLOC(sizeof(gpuEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmStart[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->GemmEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ScatterEnd[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeH2D[i]))); + 
checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkGPUErrors(gpuEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkGPUErrors(gpuMalloc( &tmp_ptr, 
cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; 
+ local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkGPUErrors(gpuMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... */ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + 
u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(doublecomplex) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(doublecomplex) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(doublecomplex) * 1e-9, + u_val_len * sizeof(doublecomplex) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_val_len * sizeof(doublecomplex))); + A_gpu->LnzvalVec = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(doublecomplex))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + 
checkGPUErrors(gpuMalloc( &tmp_ptr, u_val_len * sizeof(doublecomplex))); + A_gpu->UnzvalVec = (doublecomplex *) tmp_ptr; + checkGPUErrors(gpuMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(doublecomplex))); + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkGPUErrors(gpuMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), gpuMemcpyHostToDevice)) ; + + checkGPUErrors(gpuMalloc( &tmp_ptr, sizeof(zLUstruct_gpu_t))) ; + *dA_gpu = (zLUstruct_gpu_t *) tmp_ptr; + checkGPUErrors(gpuMemcpy( *dA_gpu, A_gpu, sizeof(zLUstruct_gpu_t), gpuMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end zCopyLUToGPU3D */ + + + +int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all gpu streams + int superlu_acc_offload = HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t 
streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + zinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + zreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + zreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* zreduceAllAncestors3d_GPU */ + + +void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nGPUStreams; streamId++) + { + double ttx = SuperLU_timer_(); + gpuStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/zsuperlu_gpu.hip.cpp b/SRC/zsuperlu_gpu.hip.cpp new file mode 100644 index 00000000..0d7a6dee --- /dev/null +++ b/SRC/zsuperlu_gpu.hip.cpp @@ -0,0 +1 @@ +#include "zsuperlu_gpu.cu" \ No newline at end of file diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c new file mode 100644 index 00000000..517a176e --- /dev/null +++ b/SRC/ztreeFactorization.c @@ -0,0 +1,762 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_zdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t zLluBufInit(zLUValSubBuf_t* LUvsb, zLUstruct_t *LUstruct)
+{
+    /* Allocate the four L/U panel exchange buffers, sized by the
+     * precomputed maxima in Llu->bufmax[0..3]. */
+    int_t *bufmax = LUstruct->Llu->bufmax;
+    LUvsb->Lsub_buf = intMalloc_dist(bufmax[0]);           /* L index buffer */
+    LUvsb->Lval_buf = doublecomplexMalloc_dist(bufmax[1]); /* L value buffer */
+    LUvsb->Usub_buf = intMalloc_dist(bufmax[2]);           /* U index buffer */
+    LUvsb->Uval_buf = doublecomplexMalloc_dist(bufmax[3]); /* U value buffer */
+    return 0;
+}
+
+zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    /* Allocate one diagonal-factorization buffer per leaf node.
+     * Initialize to NULL so a well-defined pointer is returned even when
+     * mxLeafNode == 0 (previously the uninitialized pointer was returned,
+     * which is undefined behavior). */
+    zdiagFactBufs_t** dFBufs = NULL;
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (zdiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(zdiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* Each entry gets its own ldt x ldt L and U scratch blocks. */
+        dFBufs[i] = (zdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(zdiagFactBufs_t));
+        assert(dFBufs[i]);
+        zinitDiagFactBufs(ldt, dFBufs[i]);
+
+    }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/
+
+    return dFBufs;
+}
+
+// sherry added
+int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs)
+{
+    /* Release every per-leaf buffer pair, then the array itself. */
+    for (int leaf = 0; leaf < mxLeafNode; ++leaf) {
+	zdiagFactBufs_t *buf = dFBufs[leaf];
+	SUPERLU_FREE(buf->BlockUFactor);
+	SUPERLU_FREE(buf->BlockLFactor);
+	SUPERLU_FREE(buf);
+    }
+
+    /* mxLeafNode can be 0 for the replicated layers of the processes;
+     * in that case the array itself was never allocated. */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs);
+
+    return 0;
+}
+
+zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct)
+{
+    /* One LU panel exchange buffer per look-ahead slot. */
+    zLUValSubBuf_t** LUvsbs =
+        (zLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(zLUValSubBuf_t*));
+    for (int_t slot = 0; slot < numLA; ++slot)
+    {
+        LUvsbs[slot] = (zLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
+        zLluBufInit(LUvsbs[slot], LUstruct);
+    }
+    return LUvsbs;
+}
+
+// sherry added
+int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs)
+{
+    /* Free the four panel buffers of each look-ahead slot, then the slot
+     * structs and the array (inverse of zLluBufInitArr). */
+    for (int_t slot = 0; slot < numLA; ++slot) {
+	zLUValSubBuf_t *vsb = LUvsbs[slot];
+	SUPERLU_FREE(vsb->Lsub_buf);
+	SUPERLU_FREE(vsb->Lval_buf);
+	SUPERLU_FREE(vsb->Usub_buf);
+	SUPERLU_FREE(vsb->Uval_buf);
+	SUPERLU_FREE(vsb);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                  zscuBufs_t* scuBufs,
+                  zLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    /* Workspace for the Schur-complement update: bigU gathers the U panel,
+     * bigV holds per-thread GEMM output (ldt rows, num_threads slabs). */
+    scuBufs->bigU = zgetBigU(nsupers, grid, LUstruct);
+    scuBufs->bigV = zgetBigV(ldt, num_threads);
+    return 0;
+}
+
+// sherry added
+int zfreeScuBufs(zscuBufs_t* scuBufs)
+{
+    /* Release the Schur-complement workspace obtained in zinitScuBufs. */
+    SUPERLU_FREE(scuBufs->bigU);
+    SUPERLU_FREE(scuBufs->bigV);
+    return 0;
+}
+
+int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf)
+{
+    /* ldt-by-ldt scratch blocks for the diagonal U and L factors. */
+    const int_t nelts = ldt * ldt;
+    dFBuf->BlockUFactor = doublecomplexMalloc_dist(nelts);
+    dFBuf->BlockLFactor = doublecomplexMalloc_dist(nelts);
+    return 0;
+}
+
+/*! \brief Dense 2D factorization of the supernodes of one subtree.
+ *
+ * For each supernode k (taken in perm_c_supno order): factor and broadcast
+ * the diagonal block, update and broadcast the L and U panels, wait for the
+ * panels, then apply the Schur-complement update via zblock_gemm_scatter.
+ * Non-blocking MPI requests live in comReqs, message counts in msgs, and
+ * the received panels in LUvsb.  Timing is accumulated in SCT.
+ * Always returns 0; factorization errors are reported through *info by
+ * zDiagFactIBCast (NOTE(review): *info semantics inferred from the callee's
+ * argument list — confirm).
+ */
+int_t zdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    zscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    zLUValSubBuf_t* LUvsb,
+    zdiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	zDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	zIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	zUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	zIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	/* Block until both panels of supernode k have arrived. */
+	zWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	zWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT);
+#endif
+        double tsch = SuperLU_timer_();
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= zSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /*Schurcomplement Update*/
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            doublecomplex* bigV = scuBufs->bigV;
+            doublecomplex* bigU = scuBufs->bigU;
+
+            /* One GEMM-scatter task per (L-block, U-block) pair:
+               ub = ij / nlb is the U (column) block and lb = ij % nlb the
+               L (row) block of the tile updated by this iteration. */
+#ifdef _OPENMP    
+#pragma omp parallel for schedule(dynamic)
+#endif
+            for (int_t ij = 0; ij < nub * nlb; ++ij)
+            {
+                /* code */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                doublecomplex** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                int_t ub = ij / nlb;
+                int_t lb
+                    = ij % nlb;
+                doublecomplex *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                doublecomplex *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                zblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+        /* Finish waiting for this supernode's diagonal-block sends before
+           their request slots are reused in the next iteration. */
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT);
+#endif
+    }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    return 0;
+} /* zdenseTreeFactor */
+
+/*
+ * 2D factorization at individual subtree. -- CPU only
+ */
+int_t zsparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    zscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    zLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    zdiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    /* Test the input parameters. */
+    *info = 0;
+    
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zsparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
+
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistancy; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = zSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /*Schurcomplement Update*/
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            doublecomplex* bigV = scuBufs->bigV;
+	    
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk; 
+							   
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    zblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    zblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    zblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
+#ifdef _OPENMP    
+#pragma omp parallel
+#endif
+            {
+#ifdef _OPENMP    
+#pragma omp for schedule(dynamic,2) nowait
+#endif
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    zblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    zLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    zUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if a U_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit zsparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* zsparseTreeFactor_ASYNC */
diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c
new file mode 100644
index 00000000..77044ba3
--- /dev/null
+++ b/SRC/ztreeFactorizationGPU.c
@@ -0,0 +1,758 @@
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "zlustruct_gpu.h" + +//#include "cblas.h" + +#ifdef GPU_ACC ///////////////// enable GPU + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +#if 0 +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} +#endif + +int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, 
+ int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nGPUStreams = sluGPU->nGPUStreams; // number of gpu streams + + if (superlu_acc_offload) + zsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + 
zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + 
SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. 
*/ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. 
after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = zSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. + int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + doublecomplex *bigV = scuBufs->bigV; + doublecomplex *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#ifdef _OPENMP +#pragma omp parallel /* Look-ahead update on CPU */ +#endif + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#ifdef _OPENMP +#pragma omp for +#endif + for (int_t ij = 0; ij < 
HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. */ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + zreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + 
comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#ifdef _OPENMP +#pragma omp parallel +#endif + { + /* Master thread performs Schur complement update on GPU. */ +#ifdef _OPENMP +#pragma omp master +#endif + { + if (superlu_acc_offload) + { +#ifdef _OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nGPUStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + zwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + zsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + zSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#ifdef _OPENMP +#pragma omp for +#endif + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... 
*/ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, 
comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + zUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement 
update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end zsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c new file mode 100644 index 00000000..184d0e23 --- /dev/null +++ b/SRC/ztrfAux.c @@ -0,0 +1,756 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routine for 3D factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
+#include "superlu_zdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure HyP that assists HALO offload of the Schur complement. */
+void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;  /* -1 => no panel has been offloaded to the GPU yet */
+#if 0
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));  /* mrb entries -- presumably max # of block rows; confirm at call site */
+    HyP->lookAhead_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));  /* sized like the L-panel value receive buffer (Llu->bufmax[1]) */
+    HyP->Remain_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));  /* same capacity as lookAhead_L_buff */
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));  /* U-block metadata for the GPU ("Phi") partition */
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));  /* U-block metadata for the host partition */
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);  /* one flag per block column */
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);  /* one flag per block row */
+#endif
+
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;  /* -1 => no offload recorded for this L block yet */
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;  /* -1 => no offload recorded for this U block yet */
+    }
+
+    HyP->last_offload = -1;  /* NOTE(review): redundant -- already set at function entry */
+    HyP->superlu_acc_offload = get_acc_offload ();  /* presumably reads the GPU-offload enable switch; confirm */
+
+    HyP->nGPUStreams =0;  /* filled in later by the GPU setup code -- TODO confirm */
+} /* zInit_HyP */
+
+/* Initialize the 3D LU structure from per-level sForests (forest-interface wrapper around zinit3DLUstruct). */
+void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, zLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;  /* levels in the z-dimension process tree -- assumes Np is a power of 2; confirm */
+    int_t numForests = (1 << maxLvl) - 1;  /* complete binary tree with maxLvl levels has 2^maxLvl - 1 nodes */
+    int_t* gNodeCount = INT_T_ALLOC (numForests);
+    int_t** gNodeLists =  (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*));
+
+    for (int i = 0; i < numForests; ++i)
+	{
+	    gNodeCount[i] = 0;
+	    gNodeLists[i] = NULL;
+	    /* flatten each non-empty forest into (count, node-list) form */
+	    if (sForests[i])
+		{	
+                    gNodeCount[i] = sForests[i]->nNodes;
+		    gNodeLists[i] = sForests[i]->nodeList;  /* aliases the forest's list; not a copy */
+		}
+	}
+    
+    /* delegate to the original (count, node-list) interface */
+    zinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     gNodeCount, gNodeLists, LUstruct, grid3d);
+
+    SUPERLU_FREE(gNodeCount);  // sherry added
+    SUPERLU_FREE(gNodeLists);  /* frees only the pointer array; node lists remain owned by sForests */
+}
+
+int_t zSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    doublecomplex *bigU,
+    int_t* Lsub_buf,
+    doublecomplex *Lval_buf,
+    int_t* Usub_buf,
+    doublecomplex *Uval_buf,
+    gridinfo_t *grid,
+    zLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    doublecomplex* uval;
+    int_t* lsub;
+    doublecomplex* lusup;
+
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        int_t ldu   = 0;
+        int_t full  = 1;
+        int_t num_u_blks = 0;
+
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        zgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* zSchurComplementSetup */
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+/* Gather the L(:,k) and U(k,:) panels of supernode k into HyP's
+ * look-ahead / remaining partitions, preparing the split CPU/GPU
+ * Schur-complement update.  Returns nonzero iff both panels are
+ * non-empty (msgcnt[0] && msgcnt[2]); 0 means nothing to do. */
+int_t zSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    zscuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, zLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    /* Unpack the communication/work buffers and LU metadata. */
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    doublecomplex * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    doublecomplex * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    doublecomplex* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    doublecomplex* uval;
+    int_t* lsub;
+    doublecomplex* lusup;
+
+    /* Reset the HyP counters for this supernode. */
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    /* Select panel sources: local storage if this process owns column/row
+       k of the grid, otherwise the receive buffers filled by the Ibcast. */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* NOTE(review): if myrow != krow and ToRecv[k] != 2, usub/uval remain
+       uninitialized; presumably msgcnt guarantees they are unused in that
+       case -- confirm against the d-precision version. */
+    /*now each procs does the schurcomplement update*/
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        /* Gather L into look-ahead/remaining row sets, then gather U
+           (only needed when some L rows were found). */
+        zRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            zRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* zSchurComplementSetupGPU */
+
+
+/* Allocate the shared GEMM scratch buffer bigV.
+ * ldt:         maximum supernode dimension (each thread's slice is
+ *              8 * ldt * ldt entries -- the factor 8 mirrors the sizing
+ *              used in zgetBigU below);
+ * num_threads: number of threads that will carve slices out of it.
+ * Aborts on allocation failure; never returns NULL. */
+doublecomplex* zgetBigV(int_t ldt, int_t num_threads)
+{
+    doublecomplex *bigV;
+    if (!(bigV = doublecomplexMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for zgemm buffV");  /* was "dgemm": d-precision copy/paste */
+    return bigV;
+}
+
+/* Allocate the bigU buffer that receives the gathered U(k,:) panel.
+ * nsupers:  number of supernodes; grid: 2D process grid;
+ * LUstruct: distributed LU data, scanned for the widest local L panel.
+ * The size is 8 * sp_ienv_dist(3) (max panel width) * max_row_size *
+ * max(Pr/Pc, 1), where max_row_size is reduced over this process row --
+ * so this is collective over grid->rscp.comm.
+ * Aborts on allocation failure (consistent with zgetBigV). */
+doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* Following circuit is for finding maximum block size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {   /* I own block column i; lsub[1] is its row dimension. */
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction within my process row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    /*Buffer size is max of of look ahead window*/
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    doublecomplex* bigU = doublecomplexMalloc_dist(bigu_size);
+    if ( !bigU )  /* was unchecked; zgetBigV aborts on failure too */
+        ABORT ("Malloc failed for zgetBigU buffer");
+
+    return bigU;
+} /* zgetBigU */
+
+
+/* Build the 3D factorization partition for nsupers supernodes: the
+ * supernodal elimination forest, per-level forests, this process's tree
+ * indices/permutations, and the LU communication buffers.  The returned
+ * structure owns its arrays and is released by zDestroy_trf3Dpartition.
+ * NOTE(review): the SUPERLU_MALLOC results below are not checked for NULL. */
+trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      zLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter zinitTrf3Dpartition()");
+#endif
+    /* Supernode elimination order and its inverse. */
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    /* Per-supernode count of children not yet factored. */
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    zinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    /* Communication buffers for exchanging L/U panels. */
+    zLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
+    zLluBufInit(LUvsb, LUstruct);
+
+    /* Map every supernode to the forest (tree) that owns it. */
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* code */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* code */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit zinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* zinitTrf3Dpartition */
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()");
+#endif
+    /* Release the flat arrays owned by the partition structure. */
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    /* Free each forest's node list and topology arrays, then the forest
+       itself; entries may be NULL for levels this grid does not own. */
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    /* The LUvsb communication buffers, then their container. */
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit zDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+/* NOTE(review): dead code kept for reference only.  If ever re-enabled:
+   the call below passes 6 arguments to the 5-parameter num_full_cols_U
+   above; the printf pairs %ld with a (long long) cast; and MPI_Allreduce
+   sends an int with mpi_int_t.  The live versions in util.c supersede. */
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* code */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /*initilize perm_u*/
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
new file mode 100644
index 00000000..3dc098f1
--- /dev/null
+++ b/SRC/ztrfCommWrapper.c
@@ -0,0 +1,547 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_zdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+/* Factor the diagonal block A(k,k) on its owner process pkk and post
+ * non-blocking broadcasts of the factored diagonal block along the
+ * process row (U part) and process column (L part).  Non-owners post
+ * the matching Irecvs into BlockUFactor / BlockLFactor.  Returns 0. */
+int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     doublecomplex *BlockUFactor,
+                     doublecomplex *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     zLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    //xsup for supersize
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        zIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        zIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /*factorize the leaf node and broadcast them
+     process row and process column*/
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Zgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is over writen here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        zPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /* NOTE(review): BlockLFactor is passed to both sends below; after
+           Local_Zgstrf2 + zPackLBlock the packed diagonal block appears to
+           carry both triangular factors -- confirm vs. d-precision code. */
+        /*Isend U blocks to the process row*/
+        int_t nsupc = SuperSize(k);
+        zISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        zISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
+
+/* Triangular solve of the L panel of supernode k against the
+ * upper-triangular diagonal factor (X * U = A: side=R, uplo=U), in
+ * strips of BL rows.  Non-owners of the diagonal solve all nsupr rows
+ * using the received BlockUFactor; the owner pkk skips the nsupc
+ * diagonal rows of its local panel.
+ * NOTE(review): the omp tasks assume an enclosing parallel region. */
+int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
+		      doublecomplex* BlockUFactor,
+		      gridinfo_t *grid,
+		      zLUstruct_t *LUstruct)
+{
+    doublecomplex alpha = {1.0, 0.0};
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /*factor the L panel*/
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        /* l rows of the panel, solved BL rows at a time as omp tasks. */
+        int_t l = nsupr;
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic) nowait
+#endif	
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif	    
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        /* Owner skips the nsupc rows of the diagonal block L(k,k). */
+        int_t l = nsupr - nsupc;
+
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+#ifdef _OPENMP    
+        // #pragma omp parallel for
+#endif	
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP    
+//#pragma omp task
+#endif
+            {
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* zLPanelTrSolve */
+
+int_t zLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    doublecomplex* BlockUFactor,
+                    gridinfo_t *grid,
+                    zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    /* Block until the U diagonal block of supernode k has been received,
+       then triangular-solve the local L panel against it. */
+    zUDiagBlockRecvWait(k, IrecvPlcd_D, factored_L, U_diag_blk_recv_req,
+                        grid, LUstruct, SCT);
+    zLPanelTrSolve(k, factored_L, BlockUFactor, grid, LUstruct);
+    return 0;
+}  /* zLPanelUpdate */
+
+#define BL  32
+
+/* Triangular solve of the U panel of supernode k against the lower
+ * factor: non-owners of the diagonal use the received BlockLFactor
+ * (leading dimension nsupc); the owner pkk uses its local L panel
+ * (leading dimension nsupr).  Per-block scratch comes from bigV, one
+ * ldt*ldt slice per OpenMP thread.  Returns 0. */
+int_t zUPanelTrSolve( int_t k,  
+                     doublecomplex* BlockLFactor,
+                     doublecomplex* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     zLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /*factor the U panel*/
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        doublecomplex *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        doublecomplex *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks. to get the iukp and rukp*/
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+#ifdef _OPENMP    
+        // #pragma omp for schedule(dynamic,2) nowait
+#endif	
+        for (int_t b = 0; b < nb; ++b)
+        {
+#ifdef _OPENMP    
+            #pragma omp task
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif		
+                doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
+                zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /*factor the U panel*/
+    if (iam == pkk)
+    {
+        /* code */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        doublecomplex *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            doublecomplex *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            doublecomplex *lusup = Lval_buf;
+
+            /* Loop through all the row blocks. to get the iukp and rukp*/
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+#ifdef _OPENMP    
+                #pragma omp task
+#endif
+                {
+#ifdef _OPENMP		
+                    int thread_id = omp_get_thread_num();
+#else		    
+                    int thread_id = 0;
+#endif		    
+                    doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
+                    zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* zUPanelTrSolve */
+
+int_t zUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    doublecomplex* BlockLFactor,
+                    doublecomplex* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    zLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+    /* Wait until the L diagonal block of supernode k has arrived, then
+       triangular-solve the local U panel against it. */
+    LDiagBlockRecvWait(k, factored_U, L_diag_blk_recv_req, grid);
+    zUPanelTrSolve(k, BlockLFactor, bigV, ldt, Ublock_info,
+                   grid, LUstruct, stat, SCT);
+    return 0;
+}  /* zUPanelUpdate */
+
+/* Post the non-blocking broadcast of the factored L panel of supernode k
+ * (owner column) or the matching Irecv on other columns; accumulates the
+ * communication volume in SCT and clears factored[k].  Returns 0. */
+int_t zIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    doublecomplex* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    zLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* code */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        doublecomplex* lusup = Lnzval_bc_ptr[lk];
+
+        zIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            /* NOTE(review): SuperSize(lk) uses the *local* block number;
+               SuperSize(k) (global) looks intended -- affects only the
+               commVolFactor statistic, not the numerics. */
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(doublecomplex));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*place Irecv*/
+            zIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+/* Post the non-blocking broadcast of the factored U panel of supernode k
+ * (owner row) or the matching Irecv on other rows; accumulates the
+ * communication volume in SCT, or zeroes msgcnt[2] when no panel will
+ * arrive.  Returns 0. */
+int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, doublecomplex* Uval_buf,
+    			     gridinfo_t *grid, zLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /*send U panel to myrow*/
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        doublecomplex* uval = Unzval_br_ptr[lk];
+        zIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* usub[1]/usub[2] hold the nzval and index lengths. */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(doublecomplex));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            zIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t zWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        {
+            /*force wait for I recv to complete*/
+            zWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t zWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            zWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
index 6ed2eb97..6688710a 100644
--- a/SRC/zutil_dist.c
+++ b/SRC/zutil_dist.c
@@ -13,10 +13,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include 
@@ -393,6 +393,7 @@ void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, doublecomp
 
     return;
 }
+/**** end utilities added for SUNDIALS ****/
 
 /*! \brief Allocate storage in ScalePermstruct */
 void zScalePermstructInit(const int_t m, const int_t n,
@@ -421,9 +422,65 @@ void zScalePermstructFree(zScalePermstruct_t *ScalePermstruct)
         SUPERLU_FREE(ScalePermstruct->R);
         SUPERLU_FREE(ScalePermstruct->C);
         break;
+      default: break;
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int zDeAllocGlu_3d(zLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+/* Free the replicated data on 3D process layer that is not grid-0 */
+int zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* zDeAllocLlu_3d */
+
 
 /**** Other utilities ****/
 void
@@ -432,9 +489,14 @@ zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx)
     int  i, j;
     for (j = 0; j < nrhs; ++j)
 	for (i = 0; i < n; ++i) {
-	    if ( i % 2 ) x[i + j*ldx].r = 1.0;
-	    else x[i + j*ldx].r = 2.0;
-	    x[i + j*ldx].i = 0.0;
+	    if ( i % 2 ) {
+	        x[i + j*ldx].r = 1.0 + (double)(i+1.)/n;
+		x[i + j*ldx].i = 1.0;
+	    }
+	    else {
+	        x[i + j*ldx].r = 2.0 + (double)(i+1.)/n;
+	        x[i + j*ldx].i = 2.0;
+            }
 	}
 }
 
@@ -557,7 +619,7 @@ void zPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 
 /*! \brief Sets all entries of matrix L to zero.
  */
-void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+void zZeroLblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct)
 {
     doublecomplex zero = {0.0, 0.0};
     register int extra, gb, j, lb, nsupc, nsupr, ncb;
@@ -587,7 +649,7 @@ void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
             }
 	}
     }
-} /* zZeroLblocks */
+} /* end zZeroLblocks */
 
 
 /*! \brief Dump the factored matrix L using matlab triple-let format
@@ -596,8 +658,8 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, zLocalLU_t *Llu)
 {
     register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-	int_t nnzL, n,nmax;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
     int_t *xsup = Glu_persist->xsup;
     int_t *index;
     doublecomplex *nzval;
@@ -650,7 +712,7 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		}
 
 	if(grid->iam==0){
-		fprintf(fp, "%d %d %d\n", n,n,nnzL);
+		fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
 	}
 
      ncb = nsupers / grid->npcol;
@@ -686,7 +748,6 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 } /* zDumpLblocks */
 
 
-
 /*! \brief Print the blocks in the factored matrix U.
  */
 void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
@@ -726,7 +787,37 @@ void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
 	    printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
 	}
     }
-} /* ZPRINTUBLOCKS */
+} /* end zPrintUblocks */
+
+/*! \brief Sets all entries of matrix U to zero.
+ */
+void zZeroUblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    doublecomplex zero = {0.0, 0.0};
+    register int i, extra, lb, len, nrb;
+    register int myrow, r;
+    zLocalLU_t *Llu = LUstruct->Llu;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    doublecomplex *nzval;
+    int nsupers = Glu_persist->supno[n-1] + 1;
+
+    nrb = nsupers / grid->nprow;
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb;
+    for (lb = 0; lb < nrb; ++lb) {
+	index = Llu->Ufstnz_br_ptr[lb];
+	if ( index ) { /* Not an empty row */
+	    nzval = Llu->Unzval_br_ptr[lb];
+	    len = index[1];  // number of entries in nzval[];
+	    for (i = 0; i < len; ++i) {
+	        nzval[i] = zero;
+	    }
+	}
+    }
+} /* end zZeroUblocks */
 
 int
 zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm,
diff --git a/TEST/pdtest.c b/TEST/pdtest.c
index c10fc127..fd0dcf24 100644
--- a/TEST/pdtest.c
+++ b/TEST/pdtest.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 		        PStatFree(&stat);
 #if 0
 		        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-				     nrhs, b, ldb, xtrue, ldx, &grid);
+				     nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 		        if ( info ) {
 			    printf(FMT3, "pdgssvx",info,izero,n,nrhs,imat,nfail);
@@ -375,7 +375,7 @@ int main(int argc, char *argv[])
 			    dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
 					  &result[2]);
 			    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-					 nrhs, b, ldb, xtrue, ldx, &grid);
+					 nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 
 			    /* Print information about the tests that did
diff --git a/TEST/pztest.c b/TEST/pztest.c
index 6b430749..e3f57bc5 100644
--- a/TEST/pztest.c
+++ b/TEST/pztest.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 		        PStatFree(&stat);
 #if 0
 		        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-				     nrhs, b, ldb, xtrue, ldx, &grid);
+				     nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 		        if ( info ) {
 			    printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail);
@@ -375,7 +375,7 @@ int main(int argc, char *argv[])
 			    dgst04(n, nrhs, solx, ldx, xact, ldx, rcond,
 					  &result[2]);
 			    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-					 nrhs, b, ldb, xtrue, ldx, &grid);
+					 nrhs, b, ldb, xtrue, ldx, grid.comm);
 #endif
 
 			    /* Print information about the tests that did
diff --git a/cmake/XSDKDefaults.cmake b/cmake/XSDKDefaults.cmake
index 20eea804..d454d789 100644
--- a/cmake/XSDKDefaults.cmake
+++ b/cmake/XSDKDefaults.cmake
@@ -98,9 +98,9 @@ SET(USE_XSDK_DEFAULTS  ${USE_XSDK_DEFAULTS_DEFAULT}  CACHE  BOOL
   "Use XSDK defaults and behavior.")
 PRINT_VAR(USE_XSDK_DEFAULTS)
 
-SET(XSDK_ENABLE_C  TRUE)
-SET(XSDK_ENABLE_CXX  TRUE)
-SET(XSDK_ENABLE_Fortran  TRUE)
+SET(XSDK_ENABLE_C  ON)
+SET(XSDK_ENABLE_CXX  ON)
+SET(XSDK_ENABLE_Fortran  ON)
 
 # Handle the compiler and flags for a language
 MACRO(XSDK_HANDLE_LANG_DEFAULTS  CMAKE_LANG_NAME  ENV_LANG_NAME
diff --git a/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh b/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh
new file mode 100644
index 00000000..96784830
--- /dev/null
+++ b/example_scripts/batch_script_mpi_runit_cori_gpu_openmpi4.sh
@@ -0,0 +1,29 @@
+module unload cray-mpich/7.7.6
+module swap PrgEnv-intel PrgEnv-gnu
+export MKLROOT=/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64
+
+# module use /global/common/software/m3169/cori/modulefiles
+# module unload openmpi
+module load cmake/3.18.2
+module load cuda/10.2.89
+module load openmpi/4.0.3
+
+
+export OMP_NUM_THREADS=1
+export NUM_GPU_STREAMS=1
+# srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/g20.rua
+
+
+export NSUP=128
+export NREL=20
+# for MAT in big.rua 
+# for MAT in g4.rua 
+for MAT in s1_mat_0_126936.bin 
+# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin 
+# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
+# for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx
+do
+srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../../matrix/$MAT
+done 
+
diff --git a/example_scripts/batch_script_mpi_runit_traverse_sml.sh b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
new file mode 100644
index 00000000..6bae66bf
--- /dev/null
+++ b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+#SBATCH --qos=test
+#SBATCH -N 1
+#SBATCH -t 00:30:00
+#SBATCH -J superlu_test
+#SBATCH --mail-user=liuyangzhuan@lbl.gov
+#SBATCH --gpus=1
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt
+module load cmake
+
+#srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/big.rua
+srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../../matrix/HTS/copter2.mtx
diff --git a/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh b/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
index cced19fb..8be323e0 100644
--- a/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
+++ b/example_scripts/run_cmake_build_cori_gpu_openmpi4.sh
@@ -36,10 +36,11 @@ cmake .. \
     -DCMAKE_Fortran_COMPILER=mpif90 \
 	-DCMAKE_INSTALL_PREFIX=. \
 	-DTPL_ENABLE_CUDALIB=ON \
+	-DTPL_ENABLE_LAPACKLIB=ON \
 	-DCMAKE_BUILD_TYPE=Release \
 	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 	-DCMAKE_CXX_FLAGS="-Ofast -DRELEASE ${INC_VTUNE}" \
-    -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 ${INC_VTUNE} -DGPU_ACC -I${CUDA_ROOT}/include" \
+    -DCMAKE_C_FLAGS="-std=c11 -DGPU_SOLVE -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 ${INC_VTUNE} -DGPU_ACC -I${CUDA_ROOT}/include" \
 	-DCMAKE_CUDA_FLAGS="--disable-warnings -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DGPU_ACC -gencode arch=compute_70,code=sm_70 -I/usr/common/software/openmpi/4.0.3/gcc/8.3.0/cuda/10.2.89/include"
 make pddrive			
 #	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so"
diff --git a/example_scripts/run_cmake_build_perlmutter_gcc.sh b/example_scripts/run_cmake_build_perlmutter_gcc.sh
new file mode 100644
index 00000000..38cd2bcb
--- /dev/null
+++ b/example_scripts/run_cmake_build_perlmutter_gcc.sh
@@ -0,0 +1,31 @@
+module load PrgEnv-gnu
+module load cpe-cuda
+module load cuda
+module load cmake/git-20210830
+rm -rf build
+mkdir build
+cd build
+cmake .. \
+     -DTPL_PARMETIS_LIBRARIES=ON \
+     -DTPL_PARMETIS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/include;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/metis/include" \
+     -DTPL_PARMETIS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a" \
+     -DTPL_ENABLE_COMBBLASLIB=ON \
+     -DTPL_COMBBLAS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/include;/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/CombBLAS-20211019/Applications/BipartiteMatchings" \
+     -DTPL_COMBBLAS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/lib/libCombBLAS.a" \
+     -DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DAdd_" \
+     -DCMAKE_C_COMPILER=cc \
+     -DCMAKE_CXX_COMPILER=CC \
+     -DXSDK_ENABLE_Fortran=ON \
+     -DCMAKE_CUDA_ARCHITECTURES=80 \
+     -DCMAKE_CUDA_FLAGS="-I${MPICH_DIR}/include" \
+     -DTPL_ENABLE_CUDALIB=TRUE \
+     -DTPL_CUDA_LIBRARIES="/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcublas.so;/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcudart.so" \
+     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
+     -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+     -DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \
+     -DBUILD_SHARED_LIBS=OFF \
+     -DCMAKE_INSTALL_PREFIX=. \
+     -DMPIEXEC_NUMPROC_FLAG=-n \
+     -DMPIEXEC_EXECUTABLE=/usr/bin/srun \
+     -DMPIEXEC_MAX_NUMPROCS=16
+make pddrive
\ No newline at end of file
diff --git a/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
new file mode 100644
index 00000000..1189d97b
--- /dev/null
+++ b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Bash script to submit many files to Cori/Edison/Queue
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64 
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt 
+#module load cmake
+
+
+export PATH=/home/yl33/cmake-3.20.3/bin/:$PATH
+cmake --version
+export PARMETIS_ROOT=~/petsc_master/traverse-pgi-openmpi-199-gpucuda-branch/
+
+export CUDA_ROOT=/usr/local/cuda-11.2
+#export CUDA_PATH=${CUDA_ROOT}
+rm -rf CMakeCache.txt
+rm -rf CMakeFiles
+rm -rf CTestTestfile.cmake
+rm -rf cmake_install.cmake
+rm -rf DartConfiguration.tcl 
+
+cmake .. \
+	-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include" \
+	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_ROOT}/lib/libparmetis.a;${PARMETIS_ROOT}/lib/libmetis.a" \
+	-DBUILD_SHARED_LIBS=OFF \
+	-DCMAKE_C_COMPILER=mpicc \
+	-DTPL_ENABLE_CUDALIB=TRUE \
+	-DTPL_ENABLE_LAPACKLIB=TRUE \
+	-Denable_openmp:BOOL=TRUE \
+	-DCMAKE_CUDA_FLAGS="-ccbin pgc++ -D_PGIC_PRINCETON_OVERRIDE_" \
+	-DCMAKE_CUDA_HOST_COMPILER=mpicc \
+	-DCMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES="/usr/local/cuda-11.2/include" \
+	-DCMAKE_INCLUDE_SYSTEM_FLAG_C="-I" \
+	-DCMAKE_CXX_COMPILER=mpiCC \
+	-DCMAKE_INSTALL_PREFIX=. \
+	-DCMAKE_BUILD_TYPE=Release \
+	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+        -DTPL_BLAS_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DTPL_LAPACK_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DCMAKE_CXX_FLAGS="-DRELEASE -pgf90libs" \
+        -DCMAKE_C_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -pgf90libs"
+
+make pddrive
+make install
+
+#	-DXSDK_ENABLE_Fortran=FALSE \ 
+
+
+#	-DCUDAToolkit_LIBRARY_ROOT="${CUDA_ROOT}" \
+
+#salloc -N 1 --qos=test -t 0:30:00 --gpus=2
+
+
+
+
+#        -DCMAKE_CUDA_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DGPU_ACC -gencode arch=compute_70,code=sm_70"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so" \
+#        -DCMAKE_CXX_FLAGS="-g -trace -Ofast -std=c++11 -DAdd_ -DRELEASE -tcollect -L$VT_LIB_DIR -lVT $VT_ADD_LIBS" \
+
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_lapack95_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_blas95_lp64.a"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a"  
+
+
+# DCMAKE_BUILD_TYPE=Release or Debug compiler options set in CMAKELIST.txt
+
+#        -DCMAKE_C_FLAGS="-g -O0 -std=c99 -DPRNTlevel=2 -DPROFlevel=1 -DDEBUGlevel=0" \
+        #-DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 ${INC_VTUNE}" \
diff --git a/make.inc.in b/make.inc.in
index 097a38a9..858ab0a3 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -9,32 +9,42 @@
 #  Creation date:   March 1, 2016	version 5.0.0
 #
 #  Modified:	    October 13, 2017    version 5.2.1
-#		    February 23, 2020   version 6.3.0
+#		    February 20, 2021   version 7.0.0
+#		    October 5, 2021     version 7.1.0
 #
 ############################################################################
 #
 #  The name of the libraries to be created/linked to
 #
 SuperLUroot = ${CMAKE_INSTALL_PREFIX}
-DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
-INCLUDEDIR  = -I$(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@ 
-INCLUDEDIR      += -I$(SuperLUroot)/../SRC
-
-XSDK_INDEX_SIZE=@XSDK_INDEX_SIZE@
-SLU_HAVE_LAPACK=@SLU_HAVE_LAPACK@
-HAVE_PARMETIS=@HAVE_PARMETIS@
-HAVE_COMBBLAS=@HAVE_COMBBLAS@
-HAVE_CUDA=@HAVE_CUDA@
-HAVE_HIP=@HAVE_HIP@
-
-LIBS 	    = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm #-lmpi
-LIBS	    += ${LAPACK_LIB_EXPORT}
-LIBS	    += ${PARMETIS_LIB_EXPORT}
-LIBS 	    += ${COMBBLAS_LIB_EXPORT}
-LIBS 	    += ${EXTRA_LIB_EXPORT}
-LIBS        += ${EXTRA_FLIB_EXPORT}
-CUDALIBS    = ${CUDA_LIB_EXPORT}
-LIBS        += ${CUDA_LIB_EXPORT}
+#DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT}
+DSUPERLULIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_EXPORT}
+INCLUDEDIR  = $(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@
+
+XSDK_INDEX_SIZE = @XSDK_INDEX_SIZE@
+SLU_HAVE_LAPACK = @SLU_HAVE_LAPACK@
+HAVE_PARMETIS   = @HAVE_PARMETIS@
+HAVE_COMBBLAS   = @HAVE_COMBBLAS@
+HAVE_CUDA       = @HAVE_CUDA@
+HAVE_HIP        = @HAVE_HIP@
+
+XSDK_ENABLE_Fortran = @XSDK_ENABLE_Fortran@
+ifeq ($(XSDK_ENABLE_Fortran),ON)
+  DFORTRANLIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_FORTRAN}
+  LIBS = $(DFORTRANLIB) $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm
+  LIBS += ${EXTRA_FLIB_EXPORT}
+else
+  LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm
+endif
+
+LIBS	 += ${LAPACK_LIB_EXPORT}
+LIBS	 += ${PARMETIS_LIB_EXPORT}
+LIBS 	 += ${COMBBLAS_LIB_EXPORT}
+LIBS 	 += ${EXTRA_LIB_EXPORT}
+# LIBS     += ${CUDA_LIB_EXPORT}
+
+CUDALIBS = ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}
+LIBS     += $(CUDALIBS)
 
 #
 #  The archiver and the flag(s) to use when building archive (library)
@@ -45,17 +55,18 @@ ARCHFLAGS    = cr
 RANLIB       = @CMAKE_RANLIB@
 
 CC           = @CMAKE_C_COMPILER@
-NVCC           = @CMAKE_CUDA_COMPILER@
-CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@
-CUDACFLAGS 	 = @CMAKE_CUDA_FLAGS@
+CFLAGS 	     = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ${SHARED_C_FLAGS_EXPORT}
+##@CMAKE_SHARED_LIBRARY_C_FLAGS@
 #CFLAGS      += -D${DirDefs}
 # CFLAGS     += @COMPILE_DEFINITIONS@
 CXX          = @CMAKE_CXX_COMPILER@
 CXXFLAGS     = @CMAKE_CXX_FLAGS_RELEASE@ @CMAKE_CXX_FLAGS@
 NVCC	     = @CMAKE_CUDA_COMPILER@
-CUDACFLAGS   = @CMAKE_CUDA_FLAGS@
+NVCCFLAGS    = @CMAKE_CUDA_FLAGS@
+
 NOOPTS       = -O0
 FORTRAN	     = @CMAKE_Fortran_COMPILER@
+FFLAGS	     = @CMAKE_Fortran_FLAGS@ @Fortran_INCLUDES@
 
 LOADER       = @CMAKE_CXX_COMPILER@
-LOADOPTS     = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@
+LOADOPTS     = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@ @CMAKE_Fortran_LINK_FLAGS@
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index fa271b7d..b318c0ce 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -7,6 +7,8 @@ then
     
 elif [ "$NERSC_HOST" == "cori" ]
 then
+#    rm -fr 64-build; mkdir 64-build; cd 64-build;
+#    export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3-64
     rm -fr cori-build; mkdir cori-build; cd cori-build;
     export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3
 #    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/shared-build
@@ -55,7 +57,10 @@ then
     -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=0 -DDEBUGlevel=0" \
     -DCMAKE_C_COMPILER=mpicc \
     -DCMAKE_CXX_COMPILER=mpicxx \
+    -DCMAKE_CXX_FLAGS="-std=c++11" \
     -DCMAKE_Fortran_COMPILER=mpif90 \
+    -DCMAKE_LINKER=mpicxx \
+    -Denable_openmp=ON \
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \