From 368165646f648a967cbfbdb9664c27f3d7330f9c Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 24 Jan 2022 10:58:50 -0800 Subject: [PATCH] Change the counters used in TriSolve communication from 'int_t' to 'int': fmod[], frecv[], bmod[], brecv[], etc. in xLocalLU_t{..} structure. Change from 'int_t' to 'int' for several variable/arrays in mc64ad_dist.c. --- .github/workflows/test.yml | 2 +- .travis.yml | 2 +- CMakeLists.txt | 7 +- EXAMPLE/pddrive.c | 10 +- EXAMPLE/pzdrive.c | 2 + SRC/comm_tree.c | 6 + SRC/communication_aux.c | 6 +- SRC/ddistribute.c | 66 +- SRC/dldperm_dist.c | 16 +- SRC/dlustruct_gpu.h | 5 +- SRC/dsp_blas2_dist.c | 33 +- SRC/dsuperlu_gpu.cu | 18 +- SRC/dutil_dist.c | 6 +- SRC/gpu_api_utils.h | 3 +- SRC/gpu_wrapper.h | 4 + SRC/mc64ad_dist.c | 137 +- SRC/memory.c | 17 + SRC/pddistribute.c | 63 +- SRC/pdgssvx.c | 67 +- SRC/pdgstrf.c | 38 +- SRC/pdgstrf3d.c | 2 +- SRC/pdgstrs.c | 56 +- SRC/pdgstrs1.c | 47 +- SRC/pdgstrs_Bglobal.c | 38 +- SRC/pdgstrs_lsum.c | 98 +- SRC/pdgstrs_lsum_cuda.cu | 11 +- SRC/pdsymbfact_distdata.c | 72 +- SRC/psdistribute.c | 121 +- SRC/psgssvx.c | 55 +- SRC/psgstrf.c | 33 +- SRC/psgstrf3d.c | 2 +- SRC/psgstrs.c | 191 +-- SRC/psgstrs1.c | 47 +- SRC/psgstrs_Bglobal.c | 38 +- SRC/psgstrs_lsum.c | 103 +- SRC/pssymbfact_distdata.c | 149 +- SRC/psutil.c | 30 +- SRC/psymbfact.c | 2 +- SRC/pzdistribute.c | 83 +- SRC/pzgssvx.c | 55 +- SRC/pzgstrf.c | 32 +- SRC/pzgstrf3d.c | 2 +- SRC/pzgstrs.c | 1519 ++++++++--------- SRC/pzgstrs1.c | 47 +- SRC/pzgstrs_Bglobal.c | 38 +- SRC/pzgstrs_lsum.c | 82 +- SRC/pzsymbfact_distdata.c | 119 +- SRC/sdistribute.c | 125 +- SRC/sldperm_dist.c | 21 +- SRC/slustruct_gpu.h | 4 +- SRC/ssp_blas2_dist.c | 2 +- SRC/ssuperlu_gpu.cu | 14 +- SRC/superlu_ddefs.h | 164 +- SRC/superlu_defs.h | 67 +- SRC/superlu_dist_config.h | 2 +- SRC/superlu_enum_consts.h | 6 +- SRC/superlu_gpu_utils.cu | 1 - SRC/superlu_zdefs.h | 140 +- SRC/supernodal_etree.c | 2 +- SRC/sutil_dist.c | 14 +- SRC/util.c | 30 +- SRC/zdistribute.c | 96 +- SRC/zldperm_dist.c | 16 +- SRC/zlustruct_gpu.h | 4 +- SRC/zsp_blas2_dist.c | 147 +- SRC/zsuperlu_gpu.cu | 33 +- SRC/zutil_dist.c | 10 +- example_scripts/run_cmake_build_debug.sh | 2 +- .../run_cmake_build_summit_gcc_gpu.sh | 2 +- .../run_cmake_build_summit_gcc_gpu_10.sh | 2 +- .../run_cmake_build_summit_gcc_nogpu.sh | 2 +- 71 files changed, 2327 insertions(+), 2159 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 391f4adc..04b731eb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -77,7 +77,7 @@ jobs: -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \ -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \ -DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpic++ \ diff --git a/.travis.yml b/.travis.yml index 3e758231..21c33df3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -98,7 +98,7 @@ install: -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \ -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \ -DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpic++ \ diff --git a/CMakeLists.txt b/CMakeLists.txt index fb7295b9..b5e7bca0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,9 +194,10 @@ if (TPL_ENABLE_CUDALIB) ## want to use cuda # find_package(CUB REQUIRED) find_package(CUDAToolkit REQUIRED) - if(CUDAToolkit_FOUND) - 
target_link_libraries(superlu PUBLIC CUDA::cudart CUDA::cusolver CUDA::cublas) - endif() +# The following appears in SRC/CMakeLists.txt +# if(CUDAToolkit_FOUND) +# target_link_libraries(superlu_dist PUBLIC CUDA::cudart CUDA::cusolver CUDA::cublas) +# endif() message("-- CUDAToolkit_LIBRARY_ROOT='${CUDAToolkit_LIBRARY_ROOT}'") if (NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "") set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index af27df66..9373fa10 100644 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -197,14 +197,10 @@ int main(int argc, char *argv[]) options.DiagInv = NO; */ set_default_options_dist(&options); - // options.IterRefine = NOREFINE; - // options.DiagInv = YES; - // options.ReplaceTinyPivot = YES; - - // options.Equil = NO; - // options.ColPerm = NATURAL; - // options.RowPerm = NOROWPERM; + options.ParSymbFact = NO; + options.ColPerm = PARMETIS; #if 0 + options.ReplaceTinyPivot = YES; options.RowPerm = LargeDiag_HWPM; options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 493997b2..35a1d2c9 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -196,6 +196,8 @@ int main(int argc, char *argv[]) options.DiagInv = NO; */ set_default_options_dist(&options); + options.ParSymbFact = YES; + options.ColPerm = PARMETIS; #if 0 options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; diff --git a/SRC/comm_tree.c b/SRC/comm_tree.c index 0c6741a5..8918cab4 100644 --- a/SRC/comm_tree.c +++ b/SRC/comm_tree.c @@ -20,10 +20,16 @@ tree->empty_= NO; // non-empty if rank_cnt>1 if(precision=='d'){ MPI_Type_contiguous( sizeof(double), MPI_BYTE, &tree->type_ ); + } + if(precision=='s'){ + MPI_Type_contiguous( sizeof(float), MPI_BYTE, &tree->type_ ); } if(precision=='z'){ MPI_Type_contiguous( sizeof(doublecomplex), MPI_BYTE, &tree->type_ ); } + //if(precision=='c'){ + //MPI_Type_contiguous( sizeof(complex), MPI_BYTE, &tree->type_ ); + //} MPI_Type_commit( &tree->type_ ); int myIdx = 0; diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c index ff0034fc..aa51f729 100644 --- a/SRC/communication_aux.c +++ b/SRC/communication_aux.c @@ -192,7 +192,7 @@ int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT) /* * The following are from trfCommWrapper.c. */ -int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, +int Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, MPI_Request *L_diag_blk_send_req, gridinfo_t *grid, SCT_t *SCT) { @@ -200,9 +200,9 @@ int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, // LocalLU_t *Llu = LUstruct->Llu; // int_t* xsup = Glu_persist->xsup; - int_t iam = grid->iam; + int iam = grid->iam; - int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); if (iam == pkk) { diff --git a/SRC/ddistribute.c b/SRC/ddistribute.c index fdb2e823..27ec4138 100644 --- a/SRC/ddistribute.c +++ b/SRC/ddistribute.c @@ -135,19 +135,20 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A, int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; + int *fmod; /* Modification count for L-solve. */ + int **fsendx_plist; /* Column process list to send down Xk. 
*/ + int nfrecvx = 0; /* Number of Xk I will receive. */ + int nfsendx = 0; /* Number of Xk I will send */ + int kseen; /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ + int *bmod; /* Modification count for U-solve. */ + int **bsendx_plist; /* Column process list to send down Xk. */ + int nbrecvx = 0; /* Number of Xk I will receive. */ + int nbsendx = 0; /* Number of Xk I will send */ + + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ @@ -173,8 +174,9 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A, int_t iword, dword; float mem_use = 0.0; - int_t *mod_bit; - int_t *frecv, *brecv, *lloc; + int *mod_bit; + int *frecv, *brecv; + int_t *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ @@ -374,7 +376,7 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) - ABORT("Malloc fails for index[]."); + ABORT("Malloc fails for index1[]."); mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; @@ -534,9 +536,9 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) + if ( !(fmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) + if ( !(bmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; @@ -598,23 +600,23 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Malloc fails for Unnz[]."); /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + fsendx_plist[i] = &index1[j]; + if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; + bsendx_plist[i] = &index1[j]; - mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*sizeof(int); /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. 
@@ -1346,9 +1348,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); /* construct the Reduce tree for L ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) + if ( !(frecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -1363,7 +1365,7 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); @@ -1678,9 +1680,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); /* construct the Reduce tree for U ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(brecv = intMalloc_dist(nlb)) ) + if ( !(brecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -1695,7 +1697,7 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. */ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); @@ -2022,7 +2024,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + if ( !(Llu->mod_bit = int32Malloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ diff --git a/SRC/dldperm_dist.c b/SRC/dldperm_dist.c index 1c43eac8..656d1128 100644 --- a/SRC/dldperm_dist.c +++ b/SRC/dldperm_dist.c @@ -22,9 +22,10 @@ at the top-level directory. #include "superlu_ddefs.h" -extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], - int_t*, int_t [], int_t*, int_t[], int_t*, double [], - int_t [], int_t []); +extern int mc64ad_dist(int *job, int *n, int_t *ne, int_t *ip, + int_t *irn, double *a, int *num, int_t *cperm, + int_t *liw, int_t *iw, int_t *ldw, double *dw, + int * icntl, int *info); /*! \brief * @@ -92,11 +93,12 @@ extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], */ int -dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], +dldperm_dist(int job, int n, int_t nnz, int_t colptr[], int_t adjncy[], double nzval[], int_t *perm, double u[], double v[]) { - int_t i, liw, ldw, num; - int_t *iw, icntl[10], info[10]; + int i, num, icntl[10], info[10]; + int_t liw, ldw; + int_t *iw; double *dw; extern double *doubleMalloc_dist(int_t); @@ -147,7 +149,7 @@ dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); #endif if ( info[0] == 1 ) { /* Structurally singular */ - printf(".. The last " IFMT " permutations:\n", n-num); + printf(".. 
The last %d permutations:\n", n-num); PrintInt10("perm", n-num, &perm[num]); } diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index f2d80bd9..85d25aac 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -19,7 +19,6 @@ #ifdef GPU_ACC // enable GPU #include "gpu_api_utils.h" - // #include "mkl.h" // #include "sec_structs.h" // #include "supernodal_etree.h" @@ -120,9 +119,9 @@ typedef struct //LUstruct_gpu_ double tHost_PCIeH2D; double tHost_PCIeD2H; - /*gpu events to measure DGEMM and SCATTER timing */ + /*GPU events to measure DGEMM and SCATTER timing */ int *isOffloaded; /*stores if any iteration is offloaded or not*/ - gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/ gpuEvent_t *ePCIeH2D; gpuEvent_t *ePCIeD2H_Start; gpuEvent_t *ePCIeD2H_End; diff --git a/SRC/dsp_blas2_dist.c b/SRC/dsp_blas2_dist.c index 657e0c20..d8ceb6a6 100644 --- a/SRC/dsp_blas2_dist.c +++ b/SRC/dsp_blas2_dist.c @@ -8,6 +8,8 @@ All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + + /*! @file * \brief Sparse BLAS 2, using some dense BLAS 2 operations * @@ -19,7 +21,7 @@ at the top-level directory. */ /* - * File name: sp_blas2.c + * File name: dsp_blas2_dist.c * Purpose: Sparse BLAS 2, using some dense BLAS 2 operations. */ @@ -146,10 +148,8 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc; luptr = SuperLU_L_NZ_START(fsupc); nrow = nsupr - nsupc; - solve_ops += nsupc * (nsupc - 1); solve_ops += 2 * nrow * nsupc; - if ( nsupc == 1 ) { for (iptr=istart+1; iptr < SuperLU_L_SUB_START(fsupc+1); ++iptr) { irow = SuperLU_L_SUB(iptr); @@ -175,9 +175,9 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1); #endif /* _CRAY */ #else - dlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); + dlsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]); - dmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], + dmatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif @@ -186,7 +186,6 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, irow = SuperLU_L_SUB(iptr); x[irow] -= work[i]; /* Scatter */ work[i] = 0.0; - } } } /* for k ... 
*/ @@ -201,7 +200,6 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc); nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc; luptr = SuperLU_L_NZ_START(fsupc); - solve_ops += nsupc * (nsupc + 1); if ( nsupc == 1 ) { @@ -210,6 +208,7 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, irow = SuperLU_U_SUB(i); x[irow] -= x[fsupc] * Uval[i]; } + } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY @@ -251,7 +250,6 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, luptr = SuperLU_L_NZ_START(fsupc); solve_ops += 2 * (nsupr - nsupc) * nsupc; - for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = SuperLU_L_NZ_START(jcol) + nsupc; @@ -264,6 +262,7 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, if ( nsupc > 1 ) { solve_ops += nsupc * (nsupc - 1); + #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); @@ -300,7 +299,6 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, } solve_ops += nsupc * (nsupc + 1); - if ( nsupc == 1 ) { x[fsupc] /= Lval[luptr]; } else { @@ -327,11 +325,10 @@ sp_dtrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, /*SuperLUStat.ops[SOLVE] += solve_ops;*/ SUPERLU_FREE(work); return 0; -} - +} /* sp_dtrsv_dist */ -/*! \brief +/*! \brief SpGEMV
   Purpose   
     =======   
@@ -399,10 +396,12 @@ sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x,
     NCformat *Astore;
     double   *Aval;
     int info;
-    double temp;
+    double temp, temp1;
     int lenx, leny, i, j, irow;
     int iy, jx, jy, kx, ky;
     int notran;
+    double zero = 0.0;
+    double one = 1.0;
 
     notran = (strncmp(trans, "N", 1)==0);
     Astore = (NCformat *) A->Store;
@@ -421,7 +420,7 @@ sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x,
     }
 
     /* Quick return if possible. */
-    if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.)
+    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
 	return 0;
 
     /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
@@ -444,14 +443,14 @@ sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x,
     if (beta != 1.) {
 	if (incy == 1) {
 	    if (beta == 0.)
-		for (i = 0; i < leny; ++i) y[i] = 0.;
+		for (i = 0; i < leny; ++i) y[i] = zero;
 	    else
 		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
 	} else {
 	    iy = ky;
 	    if (beta == 0.)
 		for (i = 0; i < leny; ++i) {
-		    y[iy] = 0.;
+		    y[iy] = zero;
 		    iy += incy;
 		}
 	    else
@@ -486,7 +485,7 @@ sp_dgemv_dist(char *trans, double alpha, SuperMatrix *A, double *x,
 	jy = ky;
 	if (incx == 1) {
 	    for (j = 0; j < A->ncol; ++j) {
-		temp = 0.;
+		temp = zero;
 		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
 		    irow = Astore->rowind[i];
 		    temp += Aval[i] * x[irow];
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 97300e2d..2f26b421 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -18,20 +18,10 @@
 
 #undef Reduce
 
-//#include 
+//#include 
 
 #include "dlustruct_gpu.h"
 
-/* Sherry - following is precision-independent file, used by both double
-   and complex. It should not included in this source code.
-   I updated CMakeLists.txt to include this for HAVE_CUDA or HAVE_HIP.  */
-#if 0
-#ifdef HAVE_CUDA
-#include "superlu_gpu_utils.cu"
-#elif defined(HAVE_HIP)
-#include "superlu_gpu_utils.hip.cpp"
-#endif
-#endif
 
 //extern "C" {
 //	void cblas_daxpy(const int N, const double alpha, const double *X,
@@ -43,7 +33,7 @@
 // #if defined(DEBUG) || defined(_DEBUG)
 // 	if (result != GPUBLAS_STATUS_SUCCESS)
 // 	{
-// 		fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result));
+// 		fprintf(stderr, "GPU BLAS Runtime Error: %s\n", gpublasGetErrorString(result));
 // 		assert(result == GPUBLAS_STATUS_SUCCESS);
 // 	}
 // #endif
@@ -233,7 +223,7 @@ void Scatter_GPU_kernel(
 
 	typedef int pfx_dtype ;
         extern  __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n);
-	
+
 	double *tempv1;
 	if (jj_st == jj0)
 	{
@@ -876,7 +866,7 @@ int dinitSluGPU3D_t(
     int_t ldt             /* NSUP read from sp_ienv(3) */
 )
 {
-    (gpuDeviceReset ())     ;
+    checkGPUErrors(gpuDeviceReset ());
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
     int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index 0fd6d9a2..d208bbc7 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -605,11 +605,11 @@ void dPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 	}
 	printf("(%d)", iam);
  	PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
-	PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
+	PrintInt32("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
     }
-    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
+    printf("nfrecvx %d\n", Llu->nfrecvx);
     k = CEILING( nsupers, grid->nprow );
-    PrintInt10("fmod", k, Llu->fmod);
+    PrintInt32("fmod", k, Llu->fmod);
 
 } /* DPRINTLBLOCKS */
 
diff --git a/SRC/gpu_api_utils.h b/SRC/gpu_api_utils.h
index 076e2d03..cb0e4dc7 100644
--- a/SRC/gpu_api_utils.h
+++ b/SRC/gpu_api_utils.h
@@ -29,7 +29,7 @@ extern "C" {
 #endif
 extern void DisplayHeader();
 extern const char* cublasGetErrorString(cublasStatus_t status);
-extern cudaError_t checkCuda(cudaError_t);
+extern gpuError_t checkGPU(gpuError_t);
 extern cublasStatus_t checkCublas(cublasStatus_t);
 extern cublasHandle_t create_handle ();
 extern void destroy_handle (cublasHandle_t handle);
@@ -38,3 +38,4 @@ extern void destroy_handle (cublasHandle_t handle);
 #endif
 
 #endif 
+#endif 
diff --git a/SRC/gpu_wrapper.h b/SRC/gpu_wrapper.h
index c3bab57d..ffb30642 100644
--- a/SRC/gpu_wrapper.h
+++ b/SRC/gpu_wrapper.h
@@ -62,7 +62,9 @@
 #define  gpublasHandle_t cublasHandle_t
 #define  gpublasSetStream cublasSetStream
 #define  gpublasDgemm cublasDgemm
+#define  gpublasSgemm cublasSgemm
 #define  gpublasZgemm cublasZgemm
+#define  gpublasCgemm cublasCgemm
 #define  GPUBLAS_OP_N CUBLAS_OP_N
 #define  gpuDoubleComplex cuDoubleComplex
 #define  gpuRuntimeGetVersion cudaRuntimeGetVersion
@@ -140,7 +142,9 @@
 #define  gpublasHandle_t hipblasHandle_t
 #define  gpublasSetStream hipblasSetStream
 #define  gpublasDgemm hipblasDgemm
+#define  gpublasSgemm hipblasSgemm
 #define  gpublasZgemm hipblasZgemm
+#define  gpublasCgemm hipblasCgemm
 #define  GPUBLAS_OP_N HIPBLAS_OP_N
 #define  gpuDoubleComplex hipblasDoubleComplex
 #define  gpuRuntimeGetVersion hipRuntimeGetVersion
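For reference, the new single- and complex-precision mappings added above let precision-independent GPU code call one wrapper name on both back ends. A minimal sketch, assuming a valid gpublasHandle_t `handle`, device pointers dA/dB/dC, and leading dimensions lda/ldb/ldc already set up by the caller:

    /* Sketch only: gpublasSgemm expands to cublasSgemm under CUDA and to
       hipblasSgemm under HIP, per the macros in gpu_wrapper.h above. */
    float alpha = 1.0f, beta = 0.0f;
    gpublasSgemm(handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                 m, n, k, &alpha, dA, lda, dB, ldb, &beta, dC, ldc);
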
diff --git a/SRC/mc64ad_dist.c b/SRC/mc64ad_dist.c
index 9b663118..7a83b10b 100644
--- a/SRC/mc64ad_dist.c
+++ b/SRC/mc64ad_dist.c
@@ -54,9 +54,9 @@ static int_t c__2 = 2;
 /* 
*/ -/* Subroutine */ int_t mc64id_dist(int_t *icntl) +/* Subroutine */ int mc64id_dist(int *icntl) { - int_t i__; + int i__; /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ @@ -118,10 +118,10 @@ static int_t c__2 = 2; } /* mc64id_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64ad_dist(int_t *job, int_t *n, int_t *ne, int_t * - ip, int_t *irn, double *a, int_t *num, int_t *cperm, - int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t * - icntl, int_t *info) +/* Subroutine */ int mc64ad_dist(int *job, int *n, int_t *ne, int_t * + ip, int_t *irn, double *a, int *num, int_t *cperm, + int_t *liw, int_t *iw, int_t *ldw, double *dw, + int * icntl, int *info) { /* System generated locals */ int_t i__1, i__2; @@ -134,17 +134,17 @@ static int_t c__2 = 2; int_t i__, j, k; double fact, rinf; - extern /* Subroutine */ int_t mc21ad_dist(int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, int_t *, int_t *), - mc64bd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t - *, int_t *, int_t *, int_t *, int_t *, int_t *, double *), - mc64rd_dist(int_t *, int_t *, int_t *, int_t *, double *), - mc64sd_dist(int_t *, int_t *, int_t *, int_t * - , double *, int_t *, int_t *, int_t *, int_t *, + extern /* Subroutine */ int mc21ad_dist(int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *), - mc64wd_dist(int_t *, int_t *, int_t *, int_t *, double *, int_t - *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t - *, double *, double *); + mc64bd_dist(int *, int_t *, int_t *, int_t *, double *, int_t + *, int *, int_t *, int_t *, int_t *, int_t *, double *), + mc64rd_dist(int *, int_t *, int_t *, int_t *, double *), + mc64sd_dist(int *, int_t *, int_t *, int_t *, + double *, int_t *, int *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *), + mc64wd_dist(int *, int_t *, int_t *, int_t *, double *, int_t *, + int*, int_t *, int_t *, int_t *, int_t *, int_t + *, double *, double *); /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ /* Research Councils *** */ @@ -349,8 +349,8 @@ static int_t c__2 = 2; info[1] = -1; info[2] = *job; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " because JOB = " IFMT "\n", info[1], *job); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " because JOB = %d\n", info[1], info[2]); } goto L99; } @@ -359,8 +359,8 @@ static int_t c__2 = 2; info[1] = -2; info[2] = *n; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " because N = " IFMT "\n", info[1], *job); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " because N = %d \n", info[1], info[2]); } goto L99; } @@ -369,8 +369,8 @@ static int_t c__2 = 2; info[1] = -3; info[2] = *ne; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " because NE = " IFMT "\n", info[1], *job); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " because NE = %d \n", info[1], info[2]); } goto L99; } @@ -394,8 +394,8 @@ static int_t c__2 = 2; info[1] = -4; info[2] = k; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " LIW too small, must be at least " IFMT "\n", info[1], k); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " LIW too small, must be at least %d\n", info[1], (int)k); } goto L99; } @@ -418,8 +418,8 @@ static int_t c__2 = 2; info[1] = -5; info[2] = k; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. 
INFO(1) = " IFMT - " LDW too small, must be at least " IFMT "\n", info[1], k); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " LDW too small, must be at least %d\n", info[1], (int)k); } goto L99; } @@ -441,10 +441,10 @@ static int_t c__2 = 2; info[1] = -6; info[2] = j; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " Column " IFMT - " contains an entry with invalid row index " IFMT "\n", - info[1], j, i__); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " Column %d" + " contains an entry with invalid row index %d\n", + info[1], (int)j, (int)i__); } goto L99; } @@ -453,10 +453,10 @@ static int_t c__2 = 2; info[1] = -7; info[2] = j; if (icntl[1] >= 0) { - printf(" ****** Error in MC64A/AD. INFO(1) = " IFMT - " Column " IFMT - " contains two or more entries with row index " IFMT "\n", - info[1], j, i__); + printf(" ****** Error in MC64A/AD. INFO(1) = %d " + " Column %d" + " contains two or more entries with row index %d\n", + info[1], (int)j, (int)i__); } goto L99; } else { @@ -469,16 +469,16 @@ static int_t c__2 = 2; } /* Print diagnostics on input */ if (icntl[3] >= 0) { - printf(" ****** Input parameters for MC64A/AD: JOB = " IFMT "," - " N = " IFMT ", NE = " IFMT "\n", *job, *n, *ne); + printf(" ****** Input parameters for MC64A/AD: JOB = %d ," + " N = %d, NE = %d\n", *job, *n, (int)*ne); printf(" IP(1:N+1) = "); for (j=1; j<=(*n+1); ++j) { - printf(IFMT, ip[j]); + printf("%d ", (int) ip[j]); if (j%8 == 0) printf("\n"); } printf("\n IRN(1:NE) = "); for (j=1; j<=(*ne); ++j) { - printf(IFMT, irn[j]); + printf("%d ", (int) irn[j]); if (j%8 == 0) printf("\n"); } printf("\n"); @@ -626,25 +626,25 @@ static int_t c__2 = 2; /* Matrix is structurally singular, return with warning */ info[1] = 1; if (icntl[2] >= 0) { - printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT + printf(" ****** Warning from MC64A/AD. INFO(1) = %d " " The matrix is structurally singular.\n", info[1]); } } if (info[1] == 2) { /* Scaling factors are large, return with warning */ if (icntl[2] >= 0) { - printf(" ****** Warning from MC64A/AD. INFO(1) = " IFMT "\n" + printf(" ****** Warning from MC64A/AD. 
INFO(1) = %d\n" " Some scaling factors may be too large.\n", info[1]); } } /* Print diagnostics on output */ if (icntl[3] >= 0) { - printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = " IFMT IFMT "\n", + printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = %d %d\n", info[1], info[2]); - printf(" NUM = " IFMT, *num); + printf(" NUM = %d", *num); printf(" CPERM(1:N) = "); for (j=1; j<=*n; ++j) { - printf(IFMT, cperm[j]); + printf("%d ", (int) cperm[j]); if (j%8 == 0) printf("\n"); } if (*job == 5) { @@ -667,8 +667,8 @@ static int_t c__2 = 2; } /* mc64ad_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64bd_dist(int_t *n, int_t *ne, int_t *ip, int_t * - irn, double *a, int_t *iperm, int_t *num, int_t *jperm, +/* Subroutine */ int mc64bd_dist(int *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int *num, int_t *jperm, int_t *pr, int_t *q, int_t *l, double *d__) { /* System generated locals */ @@ -690,10 +690,10 @@ static int_t c__2 = 2; double dnew; int_t jord, qlen, idum, jdum; double rinf; - extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, - double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *, + extern /* Subroutine */ int mc64dd_dist(int_t *, int *, int_t *, + double *, int_t *, int_t *), mc64ed_dist(int_t *, int *, int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t * - , int_t *, int_t *, int_t *, double *, int_t *, int_t *); + , int_t *, int *n, int_t *, double *, int_t *, int_t *); /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ @@ -1094,7 +1094,7 @@ static int_t c__2 = 2; } /* mc64bd_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64dd_dist(int_t *i__, int_t *n, int_t *q, double +/* Subroutine */ int mc64dd_dist(int_t *i__, int *n, int_t *q, double *d__, int_t *l, int_t *iway) { /* System generated locals */ @@ -1172,7 +1172,7 @@ static int_t c__2 = 2; } /* mc64dd_dist */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64ed_dist(int_t *qlen, int_t *n, int_t *q, +/* Subroutine */ int mc64ed_dist(int_t *qlen, int *n, int_t *q, double *d__, int_t *l, int_t *iway) { /* System generated locals */ @@ -1267,7 +1267,7 @@ static int_t c__2 = 2; } /* mc64ed_dist */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64fd_dist(int_t *pos0, int_t *qlen, int_t *n, +/* Subroutine */ int mc64fd_dist(int_t *pos0, int_t *qlen, int *n, int_t *q, double *d__, int_t *l, int_t *iway) { /* System generated locals */ @@ -1405,8 +1405,8 @@ static int_t c__2 = 2; } /* mc64fd_dist */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64rd_dist(int_t *n, int_t *ne, int_t *ip, - int_t *irn, double *a) +/* Subroutine */ int mc64rd_dist(int *n, int_t *ne, int_t *ip, + int_t *irn, double *a) { /* System generated locals */ int_t i__1, i__2, i__3; @@ -1553,8 +1553,8 @@ static int_t c__2 = 2; } /* mc64rd_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64sd_dist(int_t *n, int_t *ne, int_t *ip, int_t * - irn, double *a, int_t *iperm, int_t *numx, int_t *w, +/* Subroutine */ int mc64sd_dist(int *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int *numx, int_t *w, int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, int_t *iw4) { @@ -1562,13 +1562,14 @@ static int_t c__2 = 2; int_t i__1, 
i__2, i__3, i__4; /* Local variables */ - int_t i__, j, k, l, ii, mod, cnt, num; + int_t i__, j, k, l, ii, mod, cnt; + int num; double bval, bmin, bmax, rinf; int_t nval, wlen, idum1, idum2, idum3; - extern /* Subroutine */ int_t mc64qd_dist(int_t *, int_t *, int_t *, + extern /* Subroutine */ int mc64qd_dist(int_t *, int_t *, int_t *, int_t *, int_t *, double *, int_t *, double *), - mc64ud_dist(int_t *, int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, + mc64ud_dist(int_t *, int_t *, int *n, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int *num, int *numx, int_t *, int_t *, int_t *, int_t *); /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ @@ -1614,7 +1615,7 @@ static int_t c__2 = 2; /* CNT is the number of calls made to MC64U/UD so far. */ /* NUM is the cardinality of last matching found. */ /* Set RINF to largest positive real number */ -/* XSL RINF = FD05AD(5) */ +/* Sherry RINF = FD05AD(5) */ /* Parameter adjustments */ --iw4; --iw; @@ -1870,7 +1871,7 @@ static int_t c__2 = 2; } /* mc64sd_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64qd_dist(int_t *ip, int_t *lenl, int_t *lenh, +/* Subroutine */ int mc64qd_dist(int_t *ip, int_t *lenl, int_t *lenh, int_t *w, int_t *wlen, double *a, int_t *nval, double *val) { /* System generated locals */ @@ -1964,9 +1965,9 @@ static int_t c__2 = 2; } /* mc64qd_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64ud_dist(int_t *id, int_t *mod, int_t *n, int_t * +/* Subroutine */ int mc64ud_dist(int_t *id, int_t *mod, int *n, int_t * irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t * - iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, + iperm, int *num, int *numx, int_t *pr, int_t *arp, int_t *cv, int_t *out) { /* System generated locals */ @@ -2167,8 +2168,8 @@ static int_t c__2 = 2; } /* mc64ud_ */ /* ********************************************************************** */ -/* Subroutine */ int_t mc64wd_dist(int_t *n, int_t *ne, int_t *ip, int_t * - irn, double *a, int_t *iperm, int_t *num, int_t *jperm, +/* Subroutine */ int mc64wd_dist(int *n, int_t *ne, int_t *ip, int_t * + irn, double *a, int_t *iperm, int *num, int_t *jperm, int_t *out, int_t *pr, int_t *q, int_t *l, double *u, double *d__) { @@ -2188,10 +2189,10 @@ static int_t c__2 = 2; double dmin__, dnew; int_t jord, qlen, jdum; double rinf; - extern /* Subroutine */ int_t mc64dd_dist(int_t *, int_t *, int_t *, - double *, int_t *, int_t *), mc64ed_dist(int_t *, int_t *, + extern /* Subroutine */ int mc64dd_dist(int_t *, int *n, int_t *, + double *, int_t *, int_t *), mc64ed_dist(int_t *, int *, int_t *, double *, int_t *, int_t *), mc64fd_dist(int_t * - , int_t *, int_t *, int_t *, double *, int_t *, + , int_t *, int *, int_t *, double *, int_t *, int_t *); diff --git a/SRC/memory.c b/SRC/memory.c index bee42f8f..6e8d4d2a 100644 --- a/SRC/memory.c +++ b/SRC/memory.c @@ -199,6 +199,23 @@ user_bcopy(char *src, char *dest, int_t bytes) +int *int32Malloc_dist(int n) +{ + int *buf; + buf = (int *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int)); + return (buf); +} + +int *int32Calloc_dist(int n) +{ + int *buf; + register int i; + buf = (int *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int)); + if ( buf ) + for (i = 0; i < n; ++i) buf[i] = 0; + return (buf); +} + int_t *intMalloc_dist(int_t n) { int_t *buf; diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c index 
8d4add54..d9d7d3ab 100644 --- a/SRC/pddistribute.c +++ b/SRC/pddistribute.c @@ -447,19 +447,20 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; + int *fmod; /* Modification count for L-solve. */ + int **fsendx_plist; /* Column process list to send down Xk. */ + int nfrecvx = 0; /* Number of Xk I will receive. */ + int nfsendx = 0; /* Number of Xk I will send */ + int kseen; /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ + int *bmod; /* Modification count for U-solve. */ + int **bsendx_plist; /* Column process list to send down Xk. */ + int nbrecvx = 0; /* Number of Xk I will receive. */ + int nbsendx = 0; /* Number of Xk I will send */ + + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ @@ -486,8 +487,9 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, float mem_use = 0.0; float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ - int_t *mod_bit; - int_t *frecv, *brecv, *lloc; + int *mod_bit; + int *frecv, *brecv; + int_t *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ @@ -857,9 +859,9 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) + if ( !(fmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) + if ( !(bmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); /* ------------------------------------------------ */ @@ -926,21 +928,21 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, /* These lists of processes will be used for triangular solves. 
*/ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + fsendx_plist[i] = &index1[j]; + if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; + bsendx_plist[i] = &index1[j]; /* -------------------------------------------------------------- */ mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr @@ -1686,9 +1688,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); /* construct the Reduce tree for L ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) + if ( !(frecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -1703,8 +1705,7 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - + MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ @@ -2024,9 +2025,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); /* construct the Reduce tree for U ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(brecv = intMalloc_dist(nlb)) ) + if ( !(brecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -2041,7 +2042,7 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. */ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); @@ -2382,7 +2383,7 @@ if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); MPI_MAX, grid->comm); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + if ( !(Llu->mod_bit = int32Malloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); #if ( PROFlevel>=1 ) diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index 4f5ba03a..7450ea56 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -29,9 +29,7 @@ at the top-level directory. #include #include "superlu_ddefs.h" -#ifdef GPU_ACC -#include "gpu_api_utils.h" -#endif + /*! \brief * *
@@ -530,19 +528,20 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	      routine. They will be freed after PDDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
 	      structures are not used.                                  */
-    fact_t   Fact;
-    double   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam,iam_g;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
+    fact_t  Fact;
+    double *a;
+    int_t   *colptr, *rowind;
+    int_t   *perm_r; /* row permutations from partial pivoting */
+    int_t   *perm_c; /* column permutation vector */
+    int_t   *etree;  /* elimination tree */
+    int_t   *rowptr, *colind;  /* Local A in NR*/
+    int_t   nnz_loc, nnz, iinfo;
+    int     m_loc, fst_row, icol;
+    int     colequ, Equil, factored, job, notran, rowequ, need_value;
+    int     i, j, irow, m, n, permc_spec;
+    int     iam, iam_g;
+    int     ldx;  /* LDA for matrix X (local). */
+    char    equed[1], norm[1];
     double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
     double   *X, *b_col, *b_work, *x_col;
     double   t;
@@ -721,11 +720,11 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	    if ( iinfo > 0 ) {
 		if ( iinfo <= m ) {
 #if ( PRNTlevel>=1 )
-		    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+		    fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
 #endif
 		} else {
 #if ( PRNTlevel>=1 )
-                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+                    fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
 #endif
                 }
  	    } else if ( iinfo < 0 ) return;
@@ -929,7 +928,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        stat->utime[ROWPERM] = t;
 #if ( PRNTlevel>=1 )
                 if ( !iam ) {
-		    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+		    printf(".. LDPERM job %d\t time: %.2f\n", job, t);
 		    fflush(stdout);
 		}
 #endif
@@ -1403,29 +1402,29 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	       factorization with Fact == DOFACT or SamePattern is asked for. */
 	}
 
-#ifdef GPU_ACC	
-	if(options->DiagInv==NO){
-	printf("GPU trisolve requires setting options->DiagInv==YES\n");
-	exit(0);
+#ifdef GPU_ACC
+        if(options->DiagInv==NO){
+	    printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n");
+	    fflush(stdout);
+	    //exit(0);  // Sherry: need to return an error flag
 	}
 #endif
 
-	if ( options->DiagInv==YES &&
-             (options->Fact == DOFACT || Fact == SamePattern ||
-              Fact == SamePattern_SameRowPerm) ) {
+	if ( options->DiagInv==YES && (Fact != FACTORED) ) {
 	    pdCompute_Diag_Inv(n, LUstruct, grid, stat, info);
-#ifdef GPU_ACC		
-		checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));	
-		checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));	
-		checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));	
-		checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat, (LUstruct->Llu->Unzval_br_cnt) * sizeof(double), gpuMemcpyHostToDevice));	
+#ifdef GPU_ACC
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
+	        (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
+	        (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
+	        (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat,
+	        (LUstruct->Llu->Unzval_br_cnt) * sizeof(double), gpuMemcpyHostToDevice));
 #endif
-
 	}
 
 
-
-
     // #pragma omp parallel
     // {
 	// #pragma omp master
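Given that the hard exit above is now only a warning, a caller that wants the GPU triangular solve still has to request the inverted diagonal blocks itself, so that pdCompute_Diag_Inv runs and Linv/Uinv are copied to the device. A hedged driver-side sketch, mirroring the option settings used in EXAMPLE/pddrive.c (the pdgssvx arguments are abbreviated in the comment):

    superlu_dist_options_t options;
    set_default_options_dist(&options);
    #ifdef GPU_ACC
    options.DiagInv = YES;   /* needed so pdCompute_Diag_Inv runs and the
                                Linv/Uinv blocks are copied to the GPU */
    #endif
    /* ... build A and b, then call
       pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
               &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); */
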
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index 40e6021b..dc1b9747 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -111,8 +111,8 @@ at the top-level directory.
 #include 
 #include "superlu_ddefs.h"
 #include "gpu_api_utils.h"
-#ifdef GPU_ACC
 
+#ifdef GPU_ACC
 // #define NUM_GPU_STREAMS 16
 // #define NUM_GPU_STREAMS 16
 #endif
@@ -384,9 +384,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     gemm_profile *gemm_stats;
 #endif
 
-// cudaProfilerStart();
-
-
     /* Test the input parameters. */
     *info = 0;
     if (m < 0)
@@ -775,7 +772,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8));
                                      //   get_max_buffer_size());
     /* array holding last column blk for each partition,
-       used in SchCompUdt--GPU.c         */
+       used in SchCompUdt-GPU.c         */
   #if 0
     int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64);
   #else
@@ -819,9 +816,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 	printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7));
         printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
                 max_row_size, max_ncols);
+        printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
+        fflush(stdout);
     }
-    printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
-    fflush(stdout);
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
@@ -834,17 +831,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
-            iam, bigv_size, buffer_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
+                iam, bigv_size, buffer_size);
+        fflush(stdout);
+    }
 #endif
-    if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(double) ,gpuHostMallocDefault)) )
+
+    if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(double), gpuHostMallocDefault)) )
         ABORT("Malloc fails for dgemm buffer V");
 
 #if ( PRNTlevel>=1 )
     if ( iam==0 ) {
         DisplayHeader();
-	printf(" Starting with %d GPU Streams \n",nstreams );
+	printf(" Starting with %d GPU Streams \n", nstreams);
         fflush(stdout);
     }
 #endif
@@ -876,14 +876,13 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 
     // size of B should be bigu_size
-
     gpuStat = gpuMalloc((void**)&dB, bigu_size * sizeof(double));
     if (gpuStat!= gpuSuccess) {
         fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(double));
         return 1;
     }
 
-    gpuStat = gpuMalloc((void**)&dC, buffer_size* sizeof(double) );
+    gpuStat = gpuMalloc((void**)&dC, buffer_size * sizeof(double) );
     if (gpuStat!= gpuSuccess) {
         fprintf(stderr, "!!!! Error in allocating C in the device \n" );
         return 1;
@@ -901,8 +900,10 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
+        fflush(stdout);
+    }
 #endif
 
 //#ifdef __INTEL_COMPILER
@@ -1932,7 +1933,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     /* Prepare error message - find the smallesr index i that U(i,i)==0 */
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
 
 #if ( PROFlevel>=1 )
@@ -2003,9 +2004,6 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     CHECK_MALLOC (iam, "Exit pdgstrf()");
 #endif
 
-
-// cudaProfilerStop();
-
     return 0;
 } /* PDGSTRF */
 
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 6c5df8a4..55339705 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -357,7 +357,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int iinfo;
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
     //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
 
diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c
index d35d326a..cbfa4c6f 100644
--- a/SRC/pdgstrs.c
+++ b/SRC/pdgstrs.c
@@ -135,7 +135,7 @@ dreadMM_dist_intoL_CSR(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
@@ -176,7 +176,7 @@ dreadMM_dist_intoL_CSR(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      /* 3/ Read n and nnz */
 #ifdef _LONGINT
-    sscanf(line, "%ld%ld%ld",m, n, nonz);
+    sscanf(line, "%lld%lld%lld",m, n, nonz);
 #else
     sscanf(line, "%d%d%d",m, n, nonz);
 #endif
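For reference, the banner test above expects a standard MatrixMarket header. An illustrative input whose first two lines look like the following (assuming the usual lower-casing of the banner tokens before the strcmp) passes both the 5-token check and the size-line sscanf:

    %%MatrixMarket matrix coordinate real general
    5 5 13
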
@@ -1084,34 +1084,34 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct,
 
     double tmax;
     	/*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
+    int  *fmod;         /* Modification count for L-solve --
     			 Count the number of local block products to
     			 be summed into lsum[lk]. */
 	int_t *fmod_sort;
 	int_t *order;
-	int_t *order1;
-	int_t *order2;
-    int_t fmod_tmp;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  nfrecvx_buf=0;
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+	//int_t *order1;
+	//int_t *order2;
+    int fmod_tmp;
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  nfrecvx_buf=0;
+    int *frecv;        /* Count of lsum[lk] contributions to be received
     			 from processes in this row.
     			 It is only valid on the diagonal processes. */
-    int_t  frecv_tmp;
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nfrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nbrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-    int_t  nleaftmp = 0, nroottmp = 0;
+    int  frecv_tmp;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nfrecv = 0; /* Count of total messages to be recv'd. */
+    int  nbrecv = 0; /* Count of total messages to be recv'd. */
+    int  nleaf = 0, nroot = 0;
+    int  nleaftmp = 0, nroottmp = 0;
     int_t  msgsize;
         /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  bmod_tmp;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  nbrecvx_buf=0;
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  bmod_tmp;
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  nbrecvx_buf=0;
+    int  *brecv;        /* Count of modifications to be recv'd from
     			 processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     int_t flagx,flaglsum,flag;
@@ -1124,7 +1124,7 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct,
 
     int_t gik,iklrow,fnz;
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
     int INFO, pad;
     int_t tmpresult;
 
@@ -1294,7 +1294,7 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS. */
-    if ( !(fmod = intMalloc_dist(nlb*aln_i)) )
+    if ( !(fmod = int32Malloc_dist(nlb*aln_i)) )
 	ABORT("Malloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i];
 	if ( !(fmod_sort = intCalloc_dist(nlb*2)) )
@@ -1308,7 +1308,7 @@ pdgstrs(int_t n, dLUstruct_t *LUstruct,
 
 	if ( !(order = intCalloc_dist(nlb)) )
 		ABORT("Calloc fails for order[].");
-	for (j=0;jfrecv = frecv;
 
@@ -2472,10 +2472,10 @@ thread_id=0;
 
 		/* Save the count to be altered so it can be used by
 		   subsequent call to PDGSTRS. */
-		if ( !(bmod = intMalloc_dist(nlb*aln_i)) )
+		if ( !(bmod = int32Malloc_dist(nlb*aln_i)) )
 			ABORT("Malloc fails for bmod[].");
 		for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i];
-		if ( !(brecv = intCalloc_dist(nlb)) )
+		if ( !(brecv = int32Calloc_dist(nlb)) )
 			ABORT("Calloc fails for brecv[].");
 		Llu->brecv = brecv;
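The hunks above are the heart of this patch: every per-block-row counter used by the triangular solves (fmod, frecv, bmod, brecv, mod_bit, ...) is now a plain 'int'. A minimal sketch of the resulting pattern, using the int32Malloc_dist helper this patch adds in SRC/memory.c; 'nlb' and the row communicator 'scp->comm' are assumed from the surrounding routines:

    int k;
    int *mod_bit, *frecv;
    if ( !(mod_bit = int32Malloc_dist(nlb)) )
        ABORT("Malloc fails for mod_bit[].");
    if ( !(frecv = int32Malloc_dist(nlb)) )
        ABORT("Malloc fails for frecv[].");
    for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
    /* ... set mod_bit[lk] = 1 where an off-diagonal block contributes ... */
    /* counts are now reduced with MPI_INT instead of mpi_int_t */
    MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
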
 
diff --git a/SRC/pdgstrs1.c b/SRC/pdgstrs1.c
index f88441ea..b987ad50 100644
--- a/SRC/pdgstrs1.c
+++ b/SRC/pdgstrs1.c
@@ -109,7 +109,8 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
     int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
     int    iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
+    int    i, ii, j, k, lb, ljb, lk;
+    int_t  il, lptr, luptr;
     int_t  nb, nlb, nub, nsupers;
     int_t  *xsup, *lsub, *usub;
     int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
@@ -125,19 +126,19 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 #endif
 
     /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -145,7 +146,7 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -179,10 +180,10 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS1. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -249,11 +250,12 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
 	    }
 	}
-	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	/*PrintInt32("mod_bit", nlb, mod_bit);*/
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	//MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -528,10 +530,10 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS1. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -555,7 +557,11 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -583,8 +589,13 @@ void pdgstrs1(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 		if ( mycol != kcol && bmod[lk] )
 		    i = 1;  /* Contribution from non-diagonal process. */
 		else i = 0;
+#if 0 // Sherry		
 		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
 			   MPI_SUM, kcol, scp->comm );
+#else			   
+		MPI_Reduce( &i, &brecv[lk], 1, MPI_INT, MPI_SUM, kcol, scp->comm );
+#endif
+
 		if ( mycol == kcol ) { /* Diagonal process. */
 		    nbrecvmod += brecv[lk];
 		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
diff --git a/SRC/pdgstrs_Bglobal.c b/SRC/pdgstrs_Bglobal.c
index 436aaf91..9730ad72 100644
--- a/SRC/pdgstrs_Bglobal.c
+++ b/SRC/pdgstrs_Bglobal.c
@@ -134,19 +134,19 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 #endif
 
     /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -154,7 +154,7 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -189,10 +189,10 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -275,7 +275,11 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif	
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -564,10 +568,10 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -591,7 +595,11 @@ pdgstrs_Bglobal(int_t n, dLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c
index 9aa84033..13db8407 100644
--- a/SRC/pdgstrs_lsum.c
+++ b/SRC/pdgstrs_lsum.c
@@ -67,7 +67,7 @@ void dlsum_fmod
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t lptr,      /* Starting position in lsub[*].                      */
  int_t luptr,     /* Starting position in lusup[*].                     */
@@ -85,8 +85,8 @@ void dlsum_fmod
     int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel;
     int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -249,7 +249,7 @@ void dlsum_bmod
  double *xk,          /* X[k].                                          */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for U-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -274,8 +274,8 @@ void dlsum_bmod
     int_t  *lsub;
     double *lusup;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *brecv = Llu->brecv;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int  *brecv = Llu->brecv;
+    int    **bsendx_plist = Llu->bsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -355,7 +355,7 @@ void dlsum_bmod
 		    dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha,
 			   lusup, &nsupr, &x[ii], &iknsupc);
 #endif
-		    // stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
+		    stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs;
 #if ( DEBUGlevel>=2 )
 		    printf("(%2d) Solve X[%2d]\n", iam, gik);
 #endif
@@ -420,7 +420,7 @@ void dlsum_fmod_inv
  double *rtemp,   /* Result of full matrix-vector multiply.             */
  int   nrhs,      /* Number of right-hand sides.                        */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t *xsup,
  gridinfo_t *grid,
  dLocalLU_t *Llu,
@@ -443,8 +443,8 @@ void dlsum_fmod_inv
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	flops_t ops_loc=0.0;
@@ -461,9 +461,9 @@ void dlsum_fmod_inv
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = 1;//ceil(CACHELINE/(double)dword);
-	aln_i = 1;//ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 	int   knsupc;    /* Size of supernode k.                               */
 	int_t nlb;       /* Number of L blocks.                                */
 
@@ -709,10 +709,11 @@ void dlsum_fmod_inv
 
 #endif
 
-								/*
-								 * Send Xk to process column Pc[k].
-								 */
-								if(LBtree_ptr[lk].empty_==NO){
+							/*
+							 * Send Xk to process column Pc[k].
+							 */
+
+							if(LBtree_ptr[lk].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -888,14 +889,7 @@ void dlsum_fmod_inv
 					    dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha,
 									lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
-						}
-							// printf("\n");
-							// printf("k: %5d\n",ik);
-							// for (i=0 ; i=1 )
 					TOC(t2, t1);
@@ -909,11 +903,10 @@ void dlsum_fmod_inv
 #endif
 
 					/*
-						* Send Xk to process column Pc[k].
-						*/
+					 * Send Xk to process column Pc[k].
+					 */
 
 					if(LBtree_ptr[lk].empty_==NO){
-
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -968,7 +961,7 @@ void dlsum_fmod_inv_master
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t *xsup,
  gridinfo_t *grid,
@@ -990,8 +983,8 @@ void dlsum_fmod_inv_master
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	int m;
@@ -1009,9 +1002,9 @@ void dlsum_fmod_inv_master
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = 1;//ceil(CACHELINE/(double)dword);
-	aln_i = 1;//ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 	ldalsum=Llu->ldalsum;
 
@@ -1325,10 +1318,11 @@ void dlsum_fmod_inv_master
 					 * Send Xk to process column Pc[k].
 					 */
 
-					if(LBtree_ptr[lk].empty_==NO){
-						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
+					if(LBtree_ptr[lk].empty_==NO) {
+						//BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
 						C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
 					}
+
 					/*
 					 * Perform local block modifications.
 					 */
@@ -1365,7 +1359,7 @@ void dlsum_bmod_inv
  double *rtemp,   /* Result of full matrix-vector multiply.             */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int *bmod,        /* Modification count for U-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -1395,13 +1389,13 @@ void dlsum_bmod_inv
 	int_t  *lsub;
 	double *lusup;
 	int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
+	int  *brecv = Llu->brecv;
+	int    **bsendx_plist = Llu->bsendx_plist;
 	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
 	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
-	int_t bmod_tmp;
+	int bmod_tmp;
 	int thread_id1;
 	double *rtemp_loc;
 	int_t nroot_send_tmp;
@@ -1412,9 +1406,9 @@ void dlsum_bmod_inv
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof(double);
-	int_t aln_d,aln_i;
-	aln_d = 1;//ceil(CACHELINE/(double)dword);
-	aln_i = 1;//ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	iam = grid->iam;
@@ -1829,7 +1823,7 @@ void dlsum_bmod_inv_master
  double *rtemp,   /* Result of full matrix-vector multiply.             */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for U-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -1857,8 +1851,8 @@ void dlsum_bmod_inv_master
 	int_t  *lsub;
 	double *lusup;
 	int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
+	int *brecv = Llu->brecv;
+	int  **bsendx_plist = Llu->bsendx_plist;
 	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
 	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
@@ -1874,9 +1868,9 @@ void dlsum_bmod_inv_master
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = 1;//ceil(CACHELINE/(double)dword);
-	aln_i = 1;//ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	rtemp_loc = &rtemp[sizertemp* thread_id];
@@ -2023,7 +2017,7 @@ void dlsum_bmod_inv_master
 		#endif
 					for (jj=0;jj=2 )
@@ -2112,8 +2106,8 @@ void dlsum_bmod_inv_master
 						// fflush(stdout);
 					// }
 					if(UBtree_ptr[lk1].empty_==NO){
-					// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'d')*nrhs+XK_H,'d');
-					C_BcTree_forwardMessageSimple(&UBtree_ptr[lk1], &x[ii - XK_H], UBtree_ptr[lk1].msgSize_*nrhs+XK_H);
+					  //BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'d')*nrhs+XK_H,'d');
+					  C_BcTree_forwardMessageSimple(&UBtree_ptr[lk1], &x[ii - XK_H], UBtree_ptr[lk1].msgSize_*nrhs+XK_H);
 					}
 
 					/*
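
The fmod[]/bmod[] counters whose type changes in this file record, per local block, how many block products still have to be summed into lsum. Each contributing thread decrements the counter under '#pragma omp atomic capture' and, if the captured value is zero, knows it was the last contributor and may do the local solve and forward the result. A minimal sketch of that decrement-and-test idiom follows, with made-up names (counter, do_block_work); it does not reproduce the actual lsum update.

    /* Sketch of the atomic decrement-and-test idiom used with fmod[]/bmod[]. */
    #include <stdio.h>

    static void do_block_work(int lk) { printf("block %d complete\n", lk); }

    int main(void)
    {
        enum { NBLOCKS = 8, NCONTRIB = 3 };
        int counter[NBLOCKS];
        int i;

        for (i = 0; i < NBLOCKS; ++i) counter[i] = NCONTRIB;

    #ifdef _OPENMP
    #pragma omp parallel for
    #endif
        for (i = 0; i < NBLOCKS * NCONTRIB; ++i) {
            int lk = i % NBLOCKS;        /* each block receives NCONTRIB updates */
            int remaining;
    #ifdef _OPENMP
    #pragma omp atomic capture
    #endif
            remaining = --counter[lk];   /* atomically decrement and read back   */

            if (remaining == 0)          /* last contributor triggers the work   */
                do_block_work(lk);
        }
        return 0;
    }
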
diff --git a/SRC/pdgstrs_lsum_cuda.cu b/SRC/pdgstrs_lsum_cuda.cu
index 30c6b596..cc5cccc1 100644
--- a/SRC/pdgstrs_lsum_cuda.cu
+++ b/SRC/pdgstrs_lsum_cuda.cu
@@ -1009,7 +1009,7 @@ __device__ void C_RdTree_forwardMessageSimple_Device(C_Tree* Tree, void* localBu
   int   nrhs,      /* Number of right-hand sides.                        */
   int   maxsup,      /* Max supernode size.                        */
   int_t   nsupers,      /* Number of total supernodes.                        */
-  int_t *fmod,     /* Modification count for L-solve.                    */
+  int *fmod,     /* Modification count for L-solve.                    */
   C_Tree  *LBtree_ptr,
   C_Tree  *LRtree_ptr,
   int_t *ilsum,
@@ -1034,7 +1034,8 @@ __device__ void C_RdTree_forwardMessageSimple_Device(C_Tree* Tree, void* localBu
 	 int    iam, iknsupc, myrow, mycol, krow, nbrow, nbrow1, nsupr,m;
 	 int_t  k,i, l,ii,ik, il, irow, j, lb, lk, rel, lib;
 	 int_t  *lsub, *lloc;
-	 int_t  luptr_tmp1,lptr1_tmp, idx_i, idx_v, fmod_tmp;
+	 int_t  luptr_tmp1,lptr1_tmp, idx_i, idx_v;
+	 int fmod_tmp;
 	//  MPI_Status status;
 	//  const int Nbk=1;
 	//  __shared__ double rtemp_loc[128]; 
@@ -1487,7 +1488,7 @@ __device__ void C_RdTree_forwardMessageSimple_Device(C_Tree* Tree, void* localBu
    double *x,       /* X array (local)                                    */
    int   nrhs,      /* Number of right-hand sides.                        */
    int_t   nsupers,      /* Number of total supernodes.                        */
-   int_t *bmod,     /* Modification count for U-solve.                    */
+   int *bmod,     /* Modification count for U-solve.                    */
    C_Tree  *UBtree_ptr,
    C_Tree  *URtree_ptr,
    int_t *ilsum,
@@ -1783,7 +1784,7 @@ __device__ void C_RdTree_forwardMessageSimple_Device(C_Tree* Tree, void* localBu
   int   nrhs,      /* Number of right-hand sides.                        */
   int   maxsup,      /* Max supernode size.                        */
   int_t   nsupers,      /* Number of total supernodes.                        */
-  int_t *fmod,     /* Modification count for L-solve.                    */
+  int *fmod,     /* Modification count for L-solve.                    */
   C_Tree  *LBtree_ptr,
   C_Tree  *LRtree_ptr,
   int_t *ilsum,
@@ -1838,7 +1839,7 @@ __device__ void C_RdTree_forwardMessageSimple_Device(C_Tree* Tree, void* localBu
   int   nrhs,      /* Number of right-hand sides.                        */
   int   maxsup,      /* Max supernode size.                        */
   int_t   nsupers,      /* Number of total supernodes.                        */
-  int_t *bmod,     /* Modification count for L-solve.                    */
+  int *bmod,     /* Modification count for U-solve.                    */
   C_Tree  *UBtree_ptr,
   C_Tree  *URtree_ptr,
   int_t *ilsum,
diff --git a/SRC/pdsymbfact_distdata.c b/SRC/pdsymbfact_distdata.c
index da436174..1bf539dc 100644
--- a/SRC/pdsymbfact_distdata.c
+++ b/SRC/pdsymbfact_distdata.c
@@ -1218,7 +1218,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int_t *index;        /* indices consist of headers and row subscripts */
   int   *index1;       /* temporary pointer to array of int */
   double *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
+  int *recvBuf;  //int_t *recvBuf;
   int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
   double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   double *Linv_bc_dat;  /* size sum of sizes of Linv_bc_ptr[lk])                 */   
@@ -1274,17 +1274,18 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int  *ToRecv, *ToSendD, **ToSendR;
 
   /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
+  int  *fmod;          /* Modification count for L-solve.        */
+  int  **fsendx_plist; /* Column process list to send down Xk.   */
+  int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nfsendx = 0;    /* Number of Xk I will send               */
+  int  kseen;
 
   /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */
+  int  *bmod;          /* Modification count for U-solve.        */
+  int  **bsendx_plist; /* Column process list to send down Xk.   */
+  int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nbsendx = 0;    /* Number of Xk I will send               */
+  
   int_t  *ilsum;         /* starting position of each supernode in
 			    the full array (local)                 */
   int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in
@@ -1309,8 +1310,9 @@ double *dense, *dense_col; /* SPA */
   int_t ldaspa;     /* LDA of SPA */
   int_t iword, dword;
   float mem_use = 0.0;
-  int_t *mod_bit;
-  int_t *frecv, *brecv, *lloc;
+  int *mod_bit;
+  int *frecv, *brecv;
+  int_t *lloc;
   double *SeedSTD_BC,*SeedSTD_RD;
   int_t idx_indx,idx_lusup;
   int_t nbrow;
@@ -1512,11 +1514,11 @@ double *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(fmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for fmod[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(bmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for bmod[].");
     return (memDist + memNLU + memTRS);
   }
@@ -1595,29 +1597,29 @@ double *dense, *dense_col; /* SPA */
   Lindval_loc_bc_ptr[nsupers_j-1] = NULL;
 
   /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+  if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
   len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fsendx_plist[i] = &index1[j];
+  if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
+    bsendx_plist[i] = &index1[j];
   /* -------------------------------------------------------------- */
   memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
 
@@ -2063,7 +2065,7 @@ double *dense, *dense_col; /* SPA */
 
   /* exchange information about bsendx_plist in between column of processors */
   k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
+  if ( !(recvBuf = (int *) SUPERLU_MALLOC(nsupers*k * sizeof(int))) ) {
     fprintf (stderr, "Malloc fails for recvBuf[].");
     return (memDist + memNLU + memTRS);
   }
@@ -2119,8 +2121,9 @@ double *dense, *dense_col; /* SPA */
     }
   }
 
-  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
-		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+  //MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
+  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, MPI_INT,
+		 recvBuf, nnzToRecv, ptrToRecv, MPI_INT, grid->comm);
 
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
@@ -2151,7 +2154,8 @@ double *dense, *dense_col; /* SPA */
   }
 
   /* exchange information about bsendx_plist in between column of processors */
-  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+  //MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, MPI_INT,
 		 MPI_MAX, grid->cscp.comm);
 
   for (jb = 0; jb < nsupers; jb ++) {
@@ -2600,9 +2604,9 @@ double *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for L ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(frecv = intMalloc_dist(nlb)) )
+		if ( !(frecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for frecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2617,7 +2621,8 @@ double *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
-		MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+		//MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+		MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 
 
@@ -2916,9 +2921,9 @@ double *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for U ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(brecv = intMalloc_dist(nlb)) )
+		if ( !(brecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for brecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2933,7 +2938,8 @@ double *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
-		MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+		//MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+		MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 
 
@@ -3220,7 +3226,7 @@ double *dense, *dense_col; /* SPA */
 #endif
 
   k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-  if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+  if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
       ABORT("Malloc fails for mod_bit[].");
 
   /* Find the maximum buffer size. */
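
The allocations in this file (and in the solve routines above) move from intMalloc_dist/intCalloc_dist, which return int_t*, to int32Malloc_dist/int32Calloc_dist, so that fmod, bmod, frecv, brecv and mod_bit really are arrays of 32-bit int. The real helpers live in SRC/memory.c (also touched by this patch); the '_sketch' stand-ins below only show the shape such wrappers plausibly take and are not the library's code.

    #include <stdlib.h>

    /* Hypothetical stand-ins for the int32*_dist helpers; the actual
       definitions are in SRC/memory.c and may differ in detail.        */
    int *int32Malloc_dist_sketch(int n)
    {
        return (int *) malloc((size_t) n * sizeof(int));   /* uninitialized */
    }

    int *int32Calloc_dist_sketch(int n)
    {
        return (int *) calloc((size_t) n, sizeof(int));    /* zero-filled   */
    }

    /* Usage mirrors the patch:  if ( !(fmod = int32Calloc_dist(nlb)) ) ABORT(...); */
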
diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c
index 30c114cb..57f5bb62 100644
--- a/SRC/psdistribute.c
+++ b/SRC/psdistribute.c
@@ -361,7 +361,7 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
  *
- * LUstruct (input) sLUstruct_t*
+ * LUstruct (input/output) sLUstruct_t*
  *        Data structures for L and U factors.
  *
  * grid   (input) gridinfo_t*
@@ -408,10 +408,10 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	float **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
 
-	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
-	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+	C_Tree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+	C_Tree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+	C_Tree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+	C_Tree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
@@ -421,17 +421,17 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
     int  *ToRecv, *ToSendD, **ToSendR;
 
     /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
+    int  *fmod;          /* Modification count for L-solve.        */
+    int  **fsendx_plist; /* Column process list to send down Xk.   */
+    int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nfsendx = 0;    /* Number of Xk I will send               */
+    int  kseen;
 
     /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int  *bmod;          /* Modification count for U-solve.        */
+    int  **bsendx_plist; /* Column process list to send down Xk.   */
+    int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nbsendx = 0;    /* Number of Xk I will send               */
     int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
@@ -460,8 +460,9 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
     float mem_use = 0.0;
     float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
 
-    int_t *mod_bit;
-    int_t *frecv, *brecv, *lloc;
+    int   *mod_bit;  // Sherry 1/16/2022: changed to 'int'
+    int   *frecv, *brecv;
+    int_t *lloc;
     float **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     float **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     double *SeedSTD_BC,*SeedSTD_RD;
@@ -807,9 +808,9 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	    ABORT("Calloc fails for SPA dense[].");
 
 	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
+	if ( !(fmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
+	if ( !(bmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for bmod[].");
 
 	/* ------------------------------------------------ */
@@ -821,6 +822,7 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	if ( !(Lnzval_bc_ptr =
               (float**)SUPERLU_MALLOC(k * sizeof(float*))) )
 	    ABORT("Malloc fails for Lnzval_bc_ptr[].");
+	Lnzval_bc_ptr[k-1] = NULL;	
 	if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
 	    ABORT("Malloc fails for Lrowind_bc_ptr[].");
 	Lrowind_bc_ptr[k-1] = NULL;
@@ -847,21 +849,21 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 
 
 	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
 	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    fsendx_plist[i] = &index1[j];
+	if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
+	    bsendx_plist[i] = &index1[j];
 	/* -------------------------------------------------------------- */
 	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
 	memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
@@ -1234,7 +1236,7 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+	if ( !(LBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1251,13 +1253,13 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb nprow*k)) )
 		ABORT("Calloc fails for ActiveFlag[].");
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
+	memTRS += k*sizeof(C_Tree) + k*dword + grid->nprow*k*iword;  //account for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
 	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
@@ -1321,8 +1323,10 @@ psdistribute(fact_t fact, int_t n, SuperMatrix *A,
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				//LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				LBtree_ptr[ljb].tag_=BC_L;
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
@@ -1373,9 +1377,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(frecv = intMalloc_dist(nlb)) )
+	if ( !(frecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for frecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1390,12 +1394,15 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0 // Sherry: 1/26/2022	   
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
+#else	
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif
 
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+	if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1438,14 +1445,14 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 
 
 	for (lib = 0; lib npcol*k)) )
 		ABORT("Calloc fails for ActiveFlagAll[].");
 	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
-	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
+	memTRS += k*sizeof(C_Tree) + k*dword + grid->npcol*k*iword;  //account for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
 		pc = PCOL( jb, grid );
@@ -1504,8 +1511,10 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 
 					// if(ib==0){
 
-					LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-					RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					//LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					LRtree_ptr[lib].tag_=RD_L;
 					// }
 
 					// printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt);
@@ -1561,7 +1570,7 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 	/* construct the Bcast tree for U ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+	if ( !(UBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1578,13 +1587,13 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 
 
 	for (ljb = 0; ljb nprow*k)) )
 		ABORT("Calloc fails for ActiveFlagAll[].");
 	for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
-	memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+	memTRS += k*sizeof(C_Tree) + k*dword + grid->nprow*k*iword;  //account for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
 
 	for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 		jb = mycol+ljb*grid->npcol;  /* not sure */
@@ -1662,8 +1671,10 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				//UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				UBtree_ptr[ljb].tag_=BC_U;
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
@@ -1702,9 +1713,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(brecv = intMalloc_dist(nlb)) )
+	if ( !(brecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for brecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1719,12 +1730,12 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
+	//MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+	if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1786,14 +1797,14 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 
 
 	for (lib = 0; lib npcol*k)) )
 		ABORT("Calloc fails for ActiveFlagAll[].");
 	for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
-	memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll
+	memTRS += k*sizeof(C_Tree) + k*dword + grid->npcol*k*iword;  //account for URtree_ptr, SeedSTD_RD, ActiveFlagAll
 
 	for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
 		fsupc = FstBlockC( jb );
@@ -1865,8 +1876,10 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 
 					// if(ib==0){
 
-					URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-					RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					//URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					URtree_ptr[lib].tag_=RD_U;
 					// }
 
 					// #if ( PRNTlevel>=1 )
@@ -1959,7 +1972,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
 		      MPI_MAX, grid->comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
 	    ABORT("Malloc fails for mod_bit[].");
 
 #if ( PROFlevel>=1 )
diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c
index 73020a02..ac704842 100644
--- a/SRC/psgssvx.c
+++ b/SRC/psgssvx.c
@@ -528,19 +528,20 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	      routine. They will be freed after PDDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
 	      structures are not used.                                  */
-    fact_t   Fact;
-    float   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam,iam_g;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
+    fact_t  Fact;
+    float *a;
+    int_t   *colptr, *rowind;
+    int_t   *perm_r; /* row permutations from partial pivoting */
+    int_t   *perm_c; /* column permutation vector */
+    int_t   *etree;  /* elimination tree */
+    int_t   *rowptr, *colind;  /* Local A in NR*/
+    int_t   nnz_loc, nnz, iinfo;
+    int     m_loc, fst_row, icol;
+    int     colequ, Equil, factored, job, notran, rowequ, need_value;
+    int     i, j, irow, m, n, permc_spec;
+    int     iam, iam_g;
+    int     ldx;  /* LDA for matrix X (local). */
+    char    equed[1], norm[1];
     float   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
     float   *X, *b_col, *b_work, *x_col;
     double   t;
@@ -719,11 +720,11 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	    if ( iinfo > 0 ) {
 		if ( iinfo <= m ) {
 #if ( PRNTlevel>=1 )
-		    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+		    fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
 #endif
 		} else {
 #if ( PRNTlevel>=1 )
-                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+                    fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
 #endif
                 }
  	    } else if ( iinfo < 0 ) return;
@@ -927,7 +928,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        stat->utime[ROWPERM] = t;
 #if ( PRNTlevel>=1 )
                 if ( !iam ) {
-		    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+		    printf(".. LDPERM job %d\t time: %.2f\n", job, t);
 		    fflush(stdout);
 		}
 #endif
@@ -1401,10 +1402,26 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	       factorization with Fact == DOFACT or SamePattern is asked for. */
 	}
 
-	if ( options->DiagInv==YES &&
-             (options->SolveInitialized == NO || Fact == SamePattern ||
-              Fact == SamePattern_SameRowPerm) ) {
+#ifdef GPU_ACC
+        if(options->DiagInv==NO){
+	    printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n");
+	    fflush(stdout);
+	    //exit(0);  // Sherry: need to return an error flag
+	}
+#endif
+
+	if ( options->DiagInv==YES && (Fact != FACTORED) ) {
 	    psCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+#ifdef GPU_ACC
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
+	        (LUstruct->Llu->Linv_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
+	        (LUstruct->Llu->Uinv_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
+	        (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat,
+	        (LUstruct->Llu->Unzval_br_cnt) * sizeof(float), gpuMemcpyHostToDevice));
+#endif
 	}
 
 
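
When GPU_ACC is defined, the block added after psCompute_Diag_Inv above copies the recomputed Linv/Uinv blocks (and the L/U numerical values) back to the device so that the GPU triangular solve reads current data. The helper below only factors out that host-to-device copy idiom; push_to_device is a made-up name, and it assumes the checkGPU/gpuMemcpy/gpuMemcpyHostToDevice wrappers from the gpu_api_utils.h / gpu_wrapper.h headers this patch already uses.

    #include "gpu_api_utils.h"   /* checkGPU(), gpuMemcpy(), gpuMemcpyHostToDevice */

    /* Illustrative only: push 'cnt' floats from a host array to its device mirror. */
    static void push_to_device(float *d_buf, const float *h_buf, long cnt)
    {
        checkGPU( gpuMemcpy(d_buf, h_buf, cnt * sizeof(float),
                            gpuMemcpyHostToDevice) );
    }
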
diff --git a/SRC/psgstrf.c b/SRC/psgstrf.c
index e43d745d..20c3d1c3 100644
--- a/SRC/psgstrf.c
+++ b/SRC/psgstrf.c
@@ -110,10 +110,11 @@ at the top-level directory.
 
 #include 
 #include "superlu_sdefs.h"
+#include "gpu_api_utils.h"
+
 #ifdef GPU_ACC
 // #define NUM_GPU_STREAMS 16
 // #define NUM_GPU_STREAMS 16
-#include "gpu_api_utils.h"
 #endif
 
 /* Various defininations     */
@@ -771,7 +772,7 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
     int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8));
                                      //   get_max_buffer_size());
     /* array holding last column blk for each partition,
-       used in SchCompUdt-cuda.c         */
+       used in SchCompUdt-gpu.c         */
   #if 0
     int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64);
   #else
@@ -815,14 +816,14 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
 	printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7));
         printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
                 max_row_size, max_ncols);
+        printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
+        fflush(stdout);
     }
-    printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
-    fflush(stdout);
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
 
-    if ( checkGPU(gpuHostAlloc((void**)&bigU,  bigu_size * sizeof(float), gpuHostAllocDefault)) )
+    if ( checkGPU(gpuHostMalloc((void**)&bigU,  bigu_size * sizeof(float), gpuHostMallocDefault)) )
         ABORT("Malloc fails for sgemm buffer U ");
 
 #if 0 // !!Sherry fix -- only dC on GPU uses buffer_size
@@ -830,18 +831,20 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
 #endif
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
-            iam, bigv_size, buffer_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
+                iam, bigv_size, buffer_size);
+        fflush(stdout);
+    }
 #endif
 
-    if ( checkGPU(gpuHostAlloc((void**)&bigV, bigv_size * sizeof(float) ,gpuHostAllocDefault)) )
+    if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(float), gpuHostMallocDefault)) )
         ABORT("Malloc fails for sgemm buffer V");
 
 #if ( PRNTlevel>=1 )
     if ( iam==0 ) {
         DisplayHeader();
-	printf(" Starting with %d gpu streams \n",nstreams );
+	printf(" Starting with %d GPU streams\n", nstreams);
         fflush(stdout);
     }
 #endif
@@ -897,8 +900,10 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
     bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
+        fflush(stdout);
+    }
 #endif
 
 //#ifdef __INTEL_COMPILER
@@ -1734,7 +1739,7 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
 
 #ifdef GPU_ACC /*-- GPU --*/
 
-#include "sSchCompUdt-cuda.c"
+#include "sSchCompUdt-gpu.c"
 
 #else
 
@@ -1928,7 +1933,7 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
     /* Prepare error message - find the smallesr index i that U(i,i)==0 */
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
 
 #if ( PROFlevel>=1 )
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index d1daf355..967d37c5 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -357,7 +357,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
     int iinfo;
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
     //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
 
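
psgstrf and psgstrf3d share one error-reporting idiom, which the parentheses added above only make explicit: a process that found no zero pivot sets its local info to n+1, the grid takes the minimum with MPI_Allreduce so the smallest offending diagonal index wins, and a global result of n+1 is translated back to "no error". The standalone restatement below uses generic names (local_info, global_info) and is illustrative only.

    /* Sketch of the zero-pivot reporting idiom at the end of psgstrf/psgstrf3d. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        int n = 100;          /* global matrix dimension (example)            */
        int local_info = 0;   /* 0 means: this process saw no zero pivot      */
        int global_info, info, rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Encode "no zero pivot" as n+1 so MPI_MIN picks out the smallest
           offending row index i (1 <= i <= n) if any process reported one.  */
        if (local_info == 0) local_info = n + 1;
        MPI_Allreduce(&local_info, &global_info, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

        info = (global_info == (n + 1)) ? 0 : global_info;
        if (rank == 0) printf("INFO = %d\n", info);

        MPI_Finalize();
        return 0;
    }
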
diff --git a/SRC/psgstrs.c b/SRC/psgstrs.c
index 3658bd6c..922d2bee 100644
--- a/SRC/psgstrs.c
+++ b/SRC/psgstrs.c
@@ -865,10 +865,10 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
     int_t nroot_send, nroot_send_tmp;
     int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
         /*-- Data structures used for broadcast and reduction trees. --*/
-    BcTree  *LBtree_ptr = Llu->LBtree_ptr;
-    RdTree  *LRtree_ptr = Llu->LRtree_ptr;
-    BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-    RdTree  *URtree_ptr = Llu->URtree_ptr;
+    C_Tree  *LBtree_ptr = Llu->LBtree_ptr;
+    C_Tree  *LRtree_ptr = Llu->LRtree_ptr;
+    C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
+    C_Tree  *URtree_ptr = Llu->URtree_ptr;
     int_t  *Urbs1; /* Number of row blocks in each block column of U. */
     int_t  *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */
     Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
@@ -894,30 +894,30 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
 
     double tmax;
     	/*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
+    int  *fmod;         /* Modification count for L-solve --
     			 Count the number of local block products to
     			 be summed into lsum[lk]. */
-    int_t fmod_tmp;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  nfrecvx_buf=0;
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+    int  fmod_tmp;
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  nfrecvx_buf=0;
+    int  *frecv;        /* Count of lsum[lk] contributions to be received
     			     from processes in this row.
     			     It is only valid on the diagonal processes. */
-    int_t  frecv_tmp;
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nfrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nbrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-    int_t  nleaftmp = 0, nroottmp = 0;
+    int  frecv_tmp;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nfrecv = 0; /* Count of total messages to be recv'd. */
+    int  nbrecv = 0; /* Count of total messages to be recv'd. */
+    int  nleaf = 0, nroot = 0;
+    int  nleaftmp = 0, nroottmp = 0;
     int_t  msgsize;
         /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  bmod_tmp;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  nbrecvx_buf=0;
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  bmod_tmp;
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  nbrecvx_buf=0;
+    int  *brecv;        /* Count of modifications to be recv'd from
     			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     int_t flagx,flaglsum,flag;
@@ -930,7 +930,7 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
 
     int_t gik,iklrow,fnz;
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
     int INFO, pad;
     int_t tmpresult;
 
@@ -958,8 +958,8 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
     int thread_id = 0;
     yes_no_t empty;
     int_t sizelsum,sizertemp,aln_d,aln_i;
-    aln_d = ceil(CACHELINE/(double)dword);
-    aln_i = ceil(CACHELINE/(double)iword);
+    aln_d = 1; //ceil(CACHELINE/(double)dword);
+    aln_i = 1; //ceil(CACHELINE/(double)iword);
     int num_thread = 1;
 
     maxsuper = sp_ienv_dist(3);
@@ -1030,10 +1030,10 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS. */
-    if ( !(fmod = intMalloc_dist(nlb*aln_i)) )
+    if ( !(fmod = int32Malloc_dist(nlb*aln_i)) )
 	ABORT("Malloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i];
-    if ( !(frecv = intCalloc_dist(nlb)) )
+    if ( !(frecv = int32Calloc_dist(nlb)) )
 	ABORT("Calloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -1140,13 +1140,15 @@ psgstrs(int_t n, sLUstruct_t *LUstruct,
 
 	nbtree = 0;
 	for (lk=0;lk0)nfrecvx_buf++;
+				//if(BcTree_getDestCount(LBtree_ptr[lk],'s')>0)nfrecvx_buf++;
+				if(LBtree_ptr[lk].destCnt_>0)nfrecvx_buf++;
 			}
-			BcTree_allocateRequest(LBtree_ptr[lk],'s');
+			//BcTree_allocateRequest(LBtree_ptr[lk],'s');
 		}
 	}
 
@@ -1172,10 +1174,11 @@ if(procs==1){
 	}
 }else{
 	for (lk=0;lknprow;  /* not sure */
@@ -1291,9 +1294,6 @@ if(procs==1){
 					&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
 
-#ifdef _OPENMP
-#pragma omp simd
-#endif
 			for (i=0 ; inpcol;  /* not sure */
 			lib = LBi( gb, grid ); /* Local block number, row-wise. */
 			ii = X_BLK( lib );
-			BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+			//BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+			C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
+			
 		}else{ // this is a reduce forwarding
 			lk = -lk - 1;
 			il = LSUM_BLK( lk );
-			RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'s')*nrhs+LSUM_H,'s');
+			//RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'s')*nrhs+LSUM_H,'s');
+			C_RdTree_forwardMessageSimple(&LRtree_ptr[lk],&lsum[il - LSUM_H],LRtree_ptr[lk].msgSize_*nrhs+LSUM_H);
+			
 		}
 	}
 
@@ -1513,9 +1517,10 @@ if(procs==1){
 				{
 				lk = LBj( k, grid );    /* local block number */
 
-				if(BcTree_getDestCount(LBtree_ptr[lk],'s')>0){
+				if(LBtree_ptr[lk].destCnt_>0){
 
-					BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+					//BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+					C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], recvbuf0, LBtree_ptr[lk].msgSize_*nrhs+XK_H);
 					// nfrecvx_buf++;
 				}
 
@@ -1564,13 +1569,10 @@ if(procs==1){
 				thread_id = 0;
 				rtemp_loc = &rtemp[sizertemp* thread_id];
 				if ( fmod_tmp==0 ) {
-				    if(RdTree_IsRoot(LRtree_ptr[lk],'s')==YES){
+				    if(C_RdTree_IsRoot(&LRtree_ptr[lk])==YES){
 				    // ii = X_BLK( lk );
 					knsupc = SuperSize( k );
 					for (ii=1;ii=2 )
 	t = SuperLU_timer_() - t;
@@ -1723,17 +1725,18 @@ if(procs==1){
 	log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd
 
 	for (lk=0;lkbmod[i];
-	if ( !(brecv = intCalloc_dist(nlb)) )
+	if ( !(brecv = int32Calloc_dist(nlb)) )
 		ABORT("Calloc fails for brecv[].");
 	Llu->brecv = brecv;
 
@@ -1843,13 +1846,13 @@ if(procs==1){
 
 	nbtree = 0;
 	for (lk=0;lk0)nbrecvx_buf++;
+			if(UBtree_ptr[lk].destCnt_>0)nbrecvx_buf++;
 		}
-		BcTree_allocateRequest(UBtree_ptr[lk],'s');
+		//BcTree_allocateRequest(UBtree_ptr[lk],'s');
 	    }
 	}
 
@@ -1860,12 +1863,12 @@ if(procs==1){
 	nrtree = 0;
 	nroot=0;
 	for (lk=0;lknprow;  /* not sure */
@@ -1881,9 +1884,6 @@ if(procs==1){
 		}
 	}
 
-#ifdef _OPENMP
-#pragma omp simd
-#endif
 	for (i = 0; i < nlb; ++i) bmod[i*aln_i] += brecv[i];
 	// for (i = 0; i < nlb; ++i)printf("bmod[i]: %5d\n",bmod[i]);
 
@@ -1965,9 +1965,6 @@ if(procs==1){
 						&alpha, Uinv, &knsupc, &x[ii],
 						&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
-				#ifdef _OPENMP
-					#pragma omp simd
-				#endif
 				for (i=0 ; inpcol;  /* not sure */
 		lib = LBi( gb, grid ); /* Local block number, row-wise. */
 		ii = X_BLK( lib );
-		BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+		//BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+		C_BcTree_forwardMessageSimple(&UBtree_ptr[lk], &x[ii - XK_H], UBtree_ptr[lk].msgSize_*nrhs+XK_H);
 	}else{ // this is a reduce forwarding
 		lk = -lk - 1;
 		il = LSUM_BLK( lk );
-		RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'s')*nrhs+LSUM_H,'s');
+		//RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'s')*nrhs+LSUM_H,'s');
+		C_RdTree_forwardMessageSimple(&URtree_ptr[lk],&lsum[il - LSUM_H ],URtree_ptr[lk].msgSize_*nrhs+LSUM_H);
 	}
 }
 
@@ -2105,9 +2104,10 @@ for (i=0;i0){
+		    if(UBtree_ptr[lk].destCnt_>0){
 
-			BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+			// BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s');
+			C_BcTree_forwardMessageSimple(&UBtree_ptr[lk], recvbuf0, UBtree_ptr[lk].msgSize_*nrhs+XK_H);
 			// nfrecvx_buf++;
 		    }
 
@@ -2127,9 +2127,6 @@ for (i=0;iutime[SOL_COMM]);
 		tmp4 += stat_loc[i]->ops[SOLVE];
 #if ( PRNTlevel>=2 )
-		f(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
+		if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
 #endif
 	}
 
@@ -2330,17 +2321,17 @@ for (i=0;ifsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -145,7 +146,7 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -179,10 +180,10 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PSGSTRS1. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -249,11 +250,12 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
 	    }
 	}
-	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	/*PrintInt32("mod_bit", nlb, mod_bit);*/
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	//MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -528,10 +530,10 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PSGSTRS1. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -555,7 +557,11 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -583,8 +589,13 @@ void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 		if ( mycol != kcol && bmod[lk] )
 		    i = 1;  /* Contribution from non-diagonal process. */
 		else i = 0;
+#if 0 // Sherry		
 		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
 			   MPI_SUM, kcol, scp->comm );
+#else			   
+		MPI_Reduce( &i, &brecv[lk], 1, MPI_INT, MPI_SUM, kcol, scp->comm );
+#endif
+
 		if ( mycol == kcol ) { /* Diagonal process. */
 		    nbrecvmod += brecv[lk];
 		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
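
The psgstrs1.c hunks above follow the same rule as the rest of the patch: counters declared as plain 'int' (fmod, frecv, bmod, brecv, mod_bit) are allocated with the int32 helpers and reduced with MPI_INT, never with mpi_int_t, which tracks int_t and may be a 64-bit MPI type. The following standalone sketch (plain MPI, not SuperLU code; the value of nlb is made up) shows the matched pattern in isolation.

    /* Standalone sketch: 'int' counters must be paired with MPI_INT.
     * mpi_int_t follows int_t and becomes a 64-bit MPI type when 64-bit
     * indexing is enabled, which would corrupt the 'int' receive buffer. */
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char *argv[])
    {
        int nlb = 8;              /* number of local block rows (made-up value) */
        int *mod_bit, *frecv;     /* plain 'int' counters, as in the patch      */

        MPI_Init(&argc, &argv);
        mod_bit = (int *) calloc(nlb, sizeof(int));
        frecv   = (int *) calloc(nlb, sizeof(int));
        mod_bit[0] = 1;           /* pretend one block row gets an off-diagonal
                                     contribution on this process              */

        /* Every process receives the count; the datatype matches the C type. */
        MPI_Allreduce(mod_bit, frecv, nlb, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        printf("frecv[0] = %d\n", frecv[0]);
        free(mod_bit); free(frecv);
        MPI_Finalize();
        return 0;
    }
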
diff --git a/SRC/psgstrs_Bglobal.c b/SRC/psgstrs_Bglobal.c
index eb972a16..d7557597 100644
--- a/SRC/psgstrs_Bglobal.c
+++ b/SRC/psgstrs_Bglobal.c
@@ -134,19 +134,19 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 #endif
 
     /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for U-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -154,7 +154,7 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -189,10 +189,10 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -275,7 +275,11 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif	
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -564,10 +568,10 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -591,7 +595,11 @@ psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
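
The "Save the count to be altered" comment in psgstrs_Bglobal.c is why fmod/bmod are copied out of Llu before each sweep: the working copy is decremented as block products arrive, while the saved counts stay valid for the next solve. A minimal sketch of that copy-then-decrement idea (the helper name is made up, not a SuperLU routine):

    /* Sketch only: the persistent counts in Llu->fmod must survive the solve,
     * so each call copies them into a working array that the sweep may
     * decrement freely. */
    #include <stdlib.h>
    #include <string.h>

    static int *copy_counts(const int *saved, int nlb)
    {
        int *work = (int *) malloc(nlb * sizeof(int));   /* like int32Malloc_dist(nlb) */
        if (work) memcpy(work, saved, nlb * sizeof(int));
        return work;   /* caller decrements work[]; saved[] stays intact for the
                          next PSGSTRS_BGLOBAL call */
    }
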
diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c
index ead09ab5..a67fd47e 100644
--- a/SRC/psgstrs_lsum.c
+++ b/SRC/psgstrs_lsum.c
@@ -67,7 +67,7 @@ void slsum_fmod
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t lptr,      /* Starting position in lsub[*].                      */
  int_t luptr,     /* Starting position in lusup[*].                     */
@@ -85,8 +85,8 @@ void slsum_fmod
     int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel;
     int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -249,7 +249,7 @@ void slsum_bmod
  float *xk,          /* X[k].                                          */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for U-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -274,8 +274,8 @@ void slsum_bmod
     int_t  *lsub;
     float *lusup;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *brecv = Llu->brecv;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int  *brecv = Llu->brecv;
+    int    **bsendx_plist = Llu->bsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -420,7 +420,7 @@ void slsum_fmod_inv
  float *rtemp,   /* Result of full matrix-vector multiply.             */
  int   nrhs,      /* Number of right-hand sides.                        */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t *xsup,
  gridinfo_t *grid,
  sLocalLU_t *Llu,
@@ -443,16 +443,16 @@ void slsum_fmod_inv
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	flops_t ops_loc=0.0;
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
-	BcTree  *LBtree_ptr = Llu->LBtree_ptr;
-	RdTree  *LRtree_ptr = Llu->LRtree_ptr;
+	C_Tree  *LBtree_ptr = Llu->LBtree_ptr;
+	C_Tree  *LRtree_ptr = Llu->LRtree_ptr;
 	int_t* idx_lsum,idx_lsum1;
 	float *rtemp_loc;
 	int_t ldalsum;
@@ -461,9 +461,9 @@ void slsum_fmod_inv
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (float);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 	int   knsupc;    /* Size of supernode k.                               */
 	int_t nlb;       /* Number of L blocks.                                */
 
@@ -713,7 +713,7 @@ void slsum_fmod_inv
 							 * Send Xk to process column Pc[k].
 							 */
 
-							if(LBtree_ptr[lk]!=NULL){
+							if(LBtree_ptr[lk].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -906,7 +906,7 @@ void slsum_fmod_inv
 					 * Send Xk to process column Pc[k].
 					 */
 
-					if(LBtree_ptr[lk]!=NULL){
+					if(LBtree_ptr[lk].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -961,7 +961,7 @@ void slsum_fmod_inv_master
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t *xsup,
  gridinfo_t *grid,
@@ -983,8 +983,8 @@ void slsum_fmod_inv_master
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	int m;
@@ -992,8 +992,8 @@ void slsum_fmod_inv_master
     MPI_Status status;
     int test_flag;
 	yes_no_t done;
-	BcTree  *LBtree_ptr = Llu->LBtree_ptr;
-	RdTree  *LRtree_ptr = Llu->LRtree_ptr;
+	C_Tree  *LBtree_ptr = Llu->LBtree_ptr;
+	C_Tree  *LRtree_ptr = Llu->LRtree_ptr;
 	int_t* idx_lsum,idx_lsum1;
 	float *rtemp_loc;
 	int_t ldalsum;
@@ -1002,9 +1002,9 @@ void slsum_fmod_inv_master
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (float);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 	ldalsum=Llu->ldalsum;
 
@@ -1235,7 +1235,8 @@ void slsum_fmod_inv_master
 						for (jj=0;jjilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
-	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;
+	int  *brecv = Llu->brecv;
+	int    **bsendx_plist = Llu->bsendx_plist;
+	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
+	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
-	int_t bmod_tmp;
+	int bmod_tmp;
 	int thread_id1;
 	float *rtemp_loc;
 	int_t nroot_send_tmp;
@@ -1403,9 +1406,9 @@ void slsum_bmod_inv
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof(float);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	iam = grid->iam;
@@ -1592,7 +1595,7 @@ void slsum_bmod_inv
 								// printf("xre: %f\n",x[ii+i]);
 								// fflush(stdout);
 							// }
-							if(UBtree_ptr[lk1]!=NULL){
+							if(UBtree_ptr[lk1].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -1772,7 +1775,7 @@ void slsum_bmod_inv
 							// printf("xre: %f\n",x[ii+i]);
 							// fflush(stdout);
 						// }
-						if(UBtree_ptr[lk1]!=NULL){
+						if(UBtree_ptr[lk1].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -1820,7 +1823,7 @@ void slsum_bmod_inv_master
  float *rtemp,   /* Result of full matrix-vector multiply.             */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for U-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -1848,10 +1851,10 @@ void slsum_bmod_inv_master
 	int_t  *lsub;
 	float *lusup;
 	int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
-	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
-	RdTree  *URtree_ptr = Llu->URtree_ptr;
+	int *brecv = Llu->brecv;
+	int  **bsendx_plist = Llu->bsendx_plist;
+	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
+	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
 	int_t bmod_tmp;
@@ -1865,9 +1868,9 @@ void slsum_bmod_inv_master
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (float);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	rtemp_loc = &rtemp[sizertemp* thread_id];
@@ -2014,7 +2017,8 @@ void slsum_bmod_inv_master
 		#endif
 					for (jj=0;jj=2 )
 				printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n",
@@ -2101,8 +2105,9 @@ void slsum_bmod_inv_master
 						// printf("xre: %f\n",x[ii+i]);
 						// fflush(stdout);
 					// }
-					if(UBtree_ptr[lk1]!=NULL){
-					BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'s')*nrhs+XK_H,'s');
+					if(UBtree_ptr[lk1].empty_==NO){
+					  //BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'s')*nrhs+XK_H,'s');
+					  C_BcTree_forwardMessageSimple(&UBtree_ptr[lk1], &x[ii - XK_H], UBtree_ptr[lk1].msgSize_*nrhs+XK_H);
 					}
 
 					/*
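
The psgstrs_lsum.c hunks replace the opaque BcTree/RdTree handles with the plain C_Tree struct: emptiness, destination count and message size are read straight from the fields, and the forwarding helpers take the tree by address. The fragment below collects the pattern in one place; it is not a compilable unit and assumes the usual solver variables (lk, ii, il, x, lsum, nrhs, fmod_tmp) are in scope exactly as in the hunks above.

    /* Broadcast side: test the struct fields directly, pass the tree by address. */
    if ( LBtree_ptr[lk].empty_ == NO && LBtree_ptr[lk].destCnt_ > 0 )
        C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H],
                                      LBtree_ptr[lk].msgSize_*nrhs + XK_H);

    /* Reduction side: once the local modification count reaches zero, either
     * finish the diagonal solve (root) or forward the partial sums upward. */
    if ( fmod_tmp == 0 ) {
        if ( C_RdTree_IsRoot(&LRtree_ptr[lk]) == YES ) {
            /* diagonal owner: all lsum contributions are in; solve with the block */
        } else {
            C_RdTree_forwardMessageSimple(&LRtree_ptr[lk], &lsum[il - LSUM_H],
                                          LRtree_ptr[lk].msgSize_*nrhs + LSUM_H);
        }
    }
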
diff --git a/SRC/pssymbfact_distdata.c b/SRC/pssymbfact_distdata.c
index 1488ee18..44c1df15 100644
--- a/SRC/pssymbfact_distdata.c
+++ b/SRC/pssymbfact_distdata.c
@@ -628,8 +628,8 @@ dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable,
       while (i < k + nnzToRecv[p]) {
 	gb = rcv_luind[i];
 	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n",
-		  iam, p, gb, nsupers, i, i-k);
+	  printf ("Pe[%d] p %d gb %d nsupers %d i " IFMT " i-k " IFMT "\n",
+		  iam, p, (int) gb, (int) nsupers, i, i-k);
 	i += 2;
 	if (sendL) gb_l = LBj( gb, grid );
 	if (sendU) gb_l = LBi( gb, grid );
@@ -1218,7 +1218,7 @@ sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int_t *index;        /* indices consist of headers and row subscripts */
   int   *index1;       /* temporary pointer to array of int */
   float *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
+  int *recvBuf;    // 1/16/22 Sherry changed to int, was:  int_t *recvBuf;
   int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
   float **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   float **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
@@ -1231,10 +1231,10 @@ sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
   int_t  *Unnz;  /* size ceil(NSUPERS/Pc) */
 
-  BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-  RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
-  BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-  RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+  C_Tree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+  C_Tree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+  C_Tree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+  C_Tree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
   int msgsize;
 
   int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
@@ -1246,17 +1246,17 @@ sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int  *ToRecv, *ToSendD, **ToSendR;
 
   /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
+  int  *fmod;          /* Modification count for L-solve.        */
+  int  **fsendx_plist; /* Column process list to send down Xk.   */
+  int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nfsendx = 0;    /* Number of Xk I will send               */
+  int  kseen;
 
   /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */
+  int  *bmod;          /* Modification count for U-solve.        */
+  int  **bsendx_plist; /* Column process list to send down Xk.   */
+  int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nbsendx = 0;    /* Number of Xk I will send               */
   int_t  *ilsum;         /* starting position of each supernode in
 			    the full array (local)                 */
   int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in
@@ -1281,8 +1281,9 @@ float *dense, *dense_col; /* SPA */
   int_t ldaspa;     /* LDA of SPA */
   int_t iword, dword;
   float mem_use = 0.0;
-  int_t *mod_bit;
-  int_t *frecv, *brecv, *lloc;
+  int *mod_bit;
+  int *frecv, *brecv;
+  int_t *lloc;
   double *SeedSTD_BC,*SeedSTD_RD;
   int_t idx_indx,idx_lusup;
   int_t nbrow;
@@ -1467,11 +1468,11 @@ float *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(fmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for fmod[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(bmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for bmod[].");
     return (memDist + memNLU + memTRS);
   }
@@ -1519,29 +1520,29 @@ float *dense, *dense_col; /* SPA */
   Lindval_loc_bc_ptr[nsupers_j-1] = NULL;
 
   /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+  if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
   len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fsendx_plist[i] = &index1[j];
+  if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
+    bsendx_plist[i] = &index1[j];
   /* -------------------------------------------------------------- */
   memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
 
@@ -1565,15 +1566,15 @@ float *dense, *dense_col; /* SPA */
 	    printf ("ERR7\n");
 	  jcol = asup_colind[i];
 	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
-		    iam, jb, gb, j, jcol);
+	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
+		    iam, (int) jb, (int) gb, (int) j, jcol);
 	  gb = BlockNum( jcol );
 	  lb = LBj( gb, grid );
 	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
 	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
 	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
-		    iam, jb, gb, j, jcol);
+	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
+		    iam, (int) jb, (int) gb, (int) j, jcol);
 	  dense_col[jcol] = asup_val[i];
 	}
 	dense_col += ldaspa_j;
@@ -1801,7 +1802,7 @@ float *dense, *dense_col; /* SPA */
 	Lrowind_bc_ptr[ljb_j] = index;
 	if (!(Lnzval_bc_ptr[ljb_j] =
 	      floatMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
+	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d\n", (int) jb);
 	  return (memDist + memNLU + memTRS);
 	}
 
@@ -1947,7 +1948,7 @@ float *dense, *dense_col; /* SPA */
 
   /* exchange information about bsendx_plist in between column of processors */
   k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
+  if ( !(recvBuf = (int *) SUPERLU_MALLOC(nsupers*k* sizeof(int))) ) {
     fprintf (stderr, "Malloc fails for recvBuf[].");
     return (memDist + memNLU + memTRS);
   }
@@ -2003,8 +2004,13 @@ float *dense, *dense_col; /* SPA */
     }
   }
 
+#if 0 // Sherry 
   MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
 		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+#else		 
+  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, MPI_INT,
+		 recvBuf, nnzToRecv, ptrToRecv, MPI_INT, grid->comm);
+#endif
 
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
@@ -2035,8 +2041,13 @@ float *dense, *dense_col; /* SPA */
   }
 
   /* exchange information about bsendx_plist in between column of processors */
+#if 0 // Sherry 1/16/2022
   MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
 		 MPI_MAX, grid->cscp.comm);
+#else
+  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, MPI_INT,
+		 MPI_MAX, grid->cscp.comm);
+#endif
 
   for (jb = 0; jb < nsupers; jb ++) {
     jbcol = PCOL( jb, grid);
@@ -2151,7 +2162,7 @@ float *dense, *dense_col; /* SPA */
 		/* construct the Bcast tree for L ... */
 
 		k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-		if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+		if ( !(LBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 			ABORT("Malloc fails for LBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 			ABORT("Calloc fails for ActiveFlag[].");
@@ -2167,14 +2178,14 @@ float *dense, *dense_col; /* SPA */
 		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 		for (ljb = 0; ljb nprow*k)) )
 			ABORT("Calloc fails for ActiveFlag[].");
 		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=3*nsupers;
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
+		memTRS += k*sizeof(C_Tree) + k*dword + grid->nprow*k*iword;  //account for LBtree_ptr, SeedSTD_BC, ActiveFlagAll
 		for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */
 			jb = mycol+ljb*grid->npcol;  /* not sure */
 			if(jbcomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-					BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				//LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				LBtree_ptr[ljb].tag_=BC_L;
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 					// fflush(stdout);
@@ -2292,9 +2305,9 @@ float *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for L ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(frecv = intMalloc_dist(nlb)) )
+		if ( !(frecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for frecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2309,12 +2322,14 @@ float *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
+#if 0 // Sherry
 		MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+#else		
+		MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif
 
 		k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+		if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 			ABORT("Malloc fails for LRtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 			ABORT("Calloc fails for ActiveFlag[].");
@@ -2338,14 +2353,14 @@ float *dense, *dense_col; /* SPA */
 
 
 		for (lib = 0; lib npcol*k)) )
 			ABORT("Calloc fails for ActiveFlagAll[].");
 		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=-3*nsupers;
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
+		memTRS += k*sizeof(C_Tree) + k*dword + grid->npcol*k*iword;  //account for LRtree_ptr, SeedSTD_RD, ActiveFlagAll
 
 
 		for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */
@@ -2409,8 +2424,10 @@ float *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-						RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					//LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					LRtree_ptr[lib].tag_=RD_L;
 						// }
 
 						// printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt);
@@ -2459,7 +2476,7 @@ float *dense, *dense_col; /* SPA */
 		/* construct the Bcast tree for U ... */
 
 		k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-		if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+		if ( !(UBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 			ABORT("Malloc fails for UBtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 			ABORT("Calloc fails for ActiveFlag[].");
@@ -2476,13 +2493,13 @@ float *dense, *dense_col; /* SPA */
 
 
 		for (ljb = 0; ljb nprow*k)) )
 			ABORT("Calloc fails for ActiveFlagAll[].");
 		for (j=0;j<grid->nprow*k;++j)ActiveFlagAll[j]=-3*nsupers;
-		memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword;  //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
+		memTRS += k*sizeof(C_Tree) + k*dword + grid->nprow*k*iword;  //account for UBtree_ptr, SeedSTD_BC, ActiveFlagAll
 
 
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
@@ -2563,8 +2580,10 @@ float *dense, *dense_col; /* SPA */
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-					UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-					BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				//UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				UBtree_ptr[ljb].tag_=BC_U;
 
 					// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 					// fflush(stdout);
@@ -2603,9 +2622,9 @@ float *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for U ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(brecv = intMalloc_dist(nlb)) )
+		if ( !(brecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for brecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2620,12 +2639,14 @@ float *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
+#if 0 // Sherry
 		MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+#else		
+		MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif		
 
 		k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+		if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 			ABORT("Malloc fails for URtree_ptr[].");
 		if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 			ABORT("Calloc fails for ActiveFlag[].");
@@ -2648,14 +2669,14 @@ float *dense, *dense_col; /* SPA */
 		MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm);
 
 		for (lib = 0; lib npcol*k)) )
 			ABORT("Calloc fails for ActiveFlagAll[].");
 		for (j=0;j<grid->npcol*k;++j)ActiveFlagAll[j]=3*nsupers;
-		memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword;  //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll
+		memTRS += k*sizeof(C_Tree) + k*dword + grid->npcol*k*iword;  //account for URtree_ptr, SeedSTD_RD, ActiveFlagAll
 
 		for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */
 			ib = myrow+lib*grid->nprow;  /* not sure */
@@ -2719,8 +2740,10 @@ float *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-						RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					//URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					URtree_ptr[lib].tag_=RD_U;
 						// }
 
 						// #if ( PRNTlevel>=1 )
@@ -2811,7 +2834,7 @@ float *dense, *dense_col; /* SPA */
 #endif
 
   k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-  if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+  if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
       ABORT("Malloc fails for mod_bit[].");
 
   /* Find the maximum buffer size. */
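
On the construction side (pssymbfact_distdata.c above), each tree is now built in place with C_BcTree_Create / C_RdTree_Create and tagged by assigning tag_ directly, instead of the old *_Create plus *_SetTag pair. A condensed fragment of that pattern, assuming grid, ranks, rank_cnt and msgsize have been computed as in the hunks:

    /* Fragment: build the column broadcast tree and the row reduction tree
     * in place, then tag them by field assignment. */
    C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
    LBtree_ptr[ljb].tag_ = BC_L;     /* broadcast of X down block column ljb  */

    C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
    LRtree_ptr[lib].tag_ = RD_L;     /* reduction of lsum along block row lib */
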
diff --git a/SRC/psutil.c b/SRC/psutil.c
index 04789940..cbb94fb4 100644
--- a/SRC/psutil.c
+++ b/SRC/psutil.c
@@ -479,10 +479,10 @@ sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
     /* The following can be freed only after iterative refinement. */
     SUPERLU_FREE(Llu->ilsum);
     SUPERLU_FREE(Llu->fmod);
-    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE((Llu->fsendx_plist)[0]);
     SUPERLU_FREE(Llu->fsendx_plist);
     SUPERLU_FREE(Llu->bmod);
-    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE((Llu->bsendx_plist)[0]);
     SUPERLU_FREE(Llu->bsendx_plist);
     SUPERLU_FREE(Llu->mod_bit);
 
@@ -859,31 +859,35 @@ sDestroy_Tree(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
 #if ( DEBUGlevel>=1 )
     int iam;
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    CHECK_MALLOC(iam, "Enter sDestroy_Tree()");
+    CHECK_MALLOC(iam, "Enter Destroy_Tree()");
 #endif
 
     nsupers = Glu_persist->supno[n-1] + 1;
 
     nb = CEILING(nsupers, grid->npcol);
     for (i=0;i<nb;++i){
-	if(Llu->LBtree_ptr[i]!=NULL){
-		BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+        if(Llu->LBtree_ptr[i].empty_==NO){    
+			// BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+            C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
+	}
+        if(Llu->UBtree_ptr[i].empty_==NO){  
+			// BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
+            C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
 	}
-	if(Llu->UBtree_ptr[i]!=NULL){
-		BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
-	}		
     }
     SUPERLU_FREE(Llu->LBtree_ptr);
     SUPERLU_FREE(Llu->UBtree_ptr);
 	
     nb = CEILING(nsupers, grid->nprow);
     for (i=0;i<nb;++i){
-	if(Llu->LRtree_ptr[i]!=NULL){
-		RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+        if(Llu->LRtree_ptr[i].empty_==NO){             
+			// RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+            C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
+	}
+        if(Llu->URtree_ptr[i].empty_==NO){ 
+			// RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
+            C_RdTree_Nullify(&Llu->URtree_ptr[i]);
 	}
-	if(Llu->URtree_ptr[i]!=NULL){
-		RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
-	}		
     }
     SUPERLU_FREE(Llu->LRtree_ptr);
     SUPERLU_FREE(Llu->URtree_ptr);
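
The (Llu->fsendx_plist)[0] and (Llu->bsendx_plist)[0] frees in sDestroy_LU work because each process list is a row of one contiguous 'int' buffer: only the backing buffer and the row-pointer array were ever allocated. A small self-contained sketch of that layout (helper names are illustrative only, error checks omitted):

    #include <stdlib.h>

    static int **alloc_plist(int nrows, int nprow)
    {
        int **plist   = (int **) malloc(nrows * sizeof(int *));
        int  *backing = (int  *) malloc((size_t) nrows * nprow * sizeof(int));
        for (int i = 0, j = 0; i < nrows; ++i, j += nprow)
            plist[i] = &backing[j];     /* row i points into the shared buffer */
        return plist;
    }

    static void free_plist(int **plist)
    {
        free(plist[0]);   /* the shared backing buffer */
        free(plist);      /* the row-pointer array     */
    }
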
diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c
index fe2b722c..05882a72 100644
--- a/SRC/psymbfact.c
+++ b/SRC/psymbfact.c
@@ -648,7 +648,7 @@ float symbfact_dist
       printf("\tNo of supers   %ld\n", (long) nsuper);
       printf("\tSize of G(L)   %ld\n", (long) szLGr);
       printf("\tSize of G(U)   %ld\n", (long) szUGr);
-      printf("\tSize of G(L+U) %ld\n", (long) szLGr+szUGr);
+      printf("\tSize of G(L+U) %ld\n", (long) (szLGr+szUGr));
 
       printf("\tParSYMBfact (MB)      :\tL\\U MAX %.2f\tAVG %.2f\n",
 	     mem_glob[0]*1e-6, 
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index bd0f7c7c..5bc4e731 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -360,7 +360,7 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
  * Glu_freeable (input) *Glu_freeable_t
  *        The global structure describing the graph of L and U.
  *
- * LUstruct (input) zLUstruct_t*
+ * LUstruct (input/output) zLUstruct_t*
  *        Data structures for L and U factors.
  *
  * grid   (input) gridinfo_t*
@@ -420,17 +420,17 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
     int  *ToRecv, *ToSendD, **ToSendR;
 
     /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
+    int  *fmod;          /* Modification count for L-solve.        */
+    int  **fsendx_plist; /* Column process list to send down Xk.   */
+    int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nfsendx = 0;    /* Number of Xk I will send               */
+    int  kseen;
 
     /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
+    int  *bmod;          /* Modification count for U-solve.        */
+    int  **bsendx_plist; /* Column process list to send down Xk.   */
+    int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nbsendx = 0;    /* Number of Xk I will send               */
     int_t  *ilsum;         /* starting position of each supernode in
 			      the full array (local)                 */
 
@@ -459,8 +459,9 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
     float mem_use = 0.0;
     float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
 
-    int_t *mod_bit;
-    int_t *frecv, *brecv, *lloc;
+    int   *mod_bit;  // Sherry 1/16/2022: changed to 'int'
+    int   *frecv, *brecv;
+    int_t *lloc;
     doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     doublecomplex **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     double *SeedSTD_BC,*SeedSTD_RD;
@@ -806,9 +807,9 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	    ABORT("Calloc fails for SPA dense[].");
 
 	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
+	if ( !(fmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
+	if ( !(bmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for bmod[].");
 
 	/* ------------------------------------------------ */
@@ -847,21 +848,21 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
 
 
 	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
 	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    fsendx_plist[i] = &index1[j];
+	if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
+	    bsendx_plist[i] = &index1[j];
 	/* -------------------------------------------------------------- */
 	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
 	memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword;  //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
@@ -1321,11 +1322,11 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A,
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				// LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
-				// BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
+				//LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
+				//BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
 				C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z');
 				LBtree_ptr[ljb].tag_=BC_L;
-				
+
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
 
@@ -1375,9 +1376,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(frecv = intMalloc_dist(nlb)) )
+	if ( !(frecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for frecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1392,8 +1393,11 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0 // Sherry: 1/26/2022	   
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
+#else	
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif
 
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1506,8 +1510,8 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 
 					// if(ib==0){
 
-					// LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
-					// RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
+					//LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
+					//RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
 					C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
 					LRtree_ptr[lib].tag_=RD_L;
 					// }
@@ -1666,10 +1670,11 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				// UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
-				// BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
+				//UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
+				//BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
 				C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z');
 				UBtree_ptr[ljb].tag_=BC_U;
+
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
 
@@ -1707,9 +1712,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(brecv = intMalloc_dist(nlb)) )
+	if ( !(brecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for brecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1724,8 +1729,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
+	//MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1870,8 +1875,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 
 					// if(ib==0){
 
-					// URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
-					// RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
+					//URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
+					//RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
 					C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
 					URtree_ptr[lib].tag_=RD_U;
 					// }
@@ -1966,7 +1971,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
 		      MPI_MAX, grid->comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
 	    ABORT("Malloc fails for mod_bit[].");
 
 #if ( PROFlevel>=1 )
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index 5decae78..743b02a9 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -527,19 +527,20 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	      routine. They will be freed after PDDISTRIBUTE routine.
 	      If options->Fact == SamePattern_SameRowPerm, these
 	      structures are not used.                                  */
-    fact_t   Fact;
-    doublecomplex   *a;
-    int_t    *colptr, *rowind;
-    int_t    *perm_r; /* row permutations from partial pivoting */
-    int_t    *perm_c; /* column permutation vector */
-    int_t    *etree;  /* elimination tree */
-    int_t    *rowptr, *colind;  /* Local A in NR*/
-    int_t    colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t    i, iinfo, j, irow, m, n, nnz, permc_spec;
-    int_t    nnz_loc, m_loc, fst_row, icol;
-    int      iam,iam_g;
-    int      ldx;  /* LDA for matrix X (local). */
-    char     equed[1], norm[1];
+    fact_t  Fact;
+    doublecomplex *a;
+    int_t   *colptr, *rowind;
+    int_t   *perm_r; /* row permutations from partial pivoting */
+    int_t   *perm_c; /* column permutation vector */
+    int_t   *etree;  /* elimination tree */
+    int_t   *rowptr, *colind;  /* Local A in NR*/
+    int_t   nnz_loc, nnz, iinfo;
+    int     m_loc, fst_row, icol;
+    int     colequ, Equil, factored, job, notran, rowequ, need_value;
+    int     i, j, irow, m, n, permc_spec;
+    int     iam, iam_g;
+    int     ldx;  /* LDA for matrix X (local). */
+    char    equed[1], norm[1];
     double   *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
     doublecomplex   *X, *b_col, *b_work, *x_col;
     double   t;
@@ -719,11 +720,11 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	    if ( iinfo > 0 ) {
 		if ( iinfo <= m ) {
 #if ( PRNTlevel>=1 )
-		    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+		    fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
 #endif
 		} else {
 #if ( PRNTlevel>=1 )
-                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+                    fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
 #endif
                 }
  	    } else if ( iinfo < 0 ) return;
@@ -928,7 +929,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	        stat->utime[ROWPERM] = t;
 #if ( PRNTlevel>=1 )
                 if ( !iam ) {
-		    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+		    printf(".. LDPERM job %d\t time: %.2f\n", job, t);
 		    fflush(stdout);
 		}
 #endif
@@ -1406,10 +1407,26 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	       factorization with Fact == DOFACT or SamePattern is asked for. */
 	}
 
-	if ( options->DiagInv==YES &&
-             (options->SolveInitialized == NO || Fact == SamePattern ||
-              Fact == SamePattern_SameRowPerm) ) {
+#ifdef GPU_ACC
+        if(options->DiagInv==NO){
+	    printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n");
+	    fflush(stdout);
+	    //exit(0);  // Sherry: need to return an error flag
+	}
+#endif
+
+	if ( options->DiagInv==YES && (Fact != FACTORED) ) {
 	    pzCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+#ifdef GPU_ACC
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
+	        (LUstruct->Llu->Linv_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
+	        (LUstruct->Llu->Uinv_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
+	        (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
+            checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat,
+	        (LUstruct->Llu->Unzval_br_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
+#endif
 	}
 
 
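The GPU triangular solve multiplies by precomputed inverses of the diagonal blocks instead of calling ZTRSM on them, which is why the hunk above warns when options->DiagInv==NO and recomputes the inverses (and mirrors them onto the device) whenever the factors are new. A minimal caller-side sketch, not taken from this patch, using only the option fields shown above:

    superlu_dist_options_t options;
    set_default_options_dist(&options);
#ifdef GPU_ACC
    options.DiagInv = YES;   /* the GPU trisolve path expects the diagonal-block inverses */
#endif
    /* ... then call pzgssvx(&options, ...) as usual ... */
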
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 1bbdba28..57f2afab 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -109,11 +109,11 @@ at the top-level directory.
 
 #include <math.h>
 #include "superlu_zdefs.h"
+#include "gpu_api_utils.h"
 
 #ifdef GPU_ACC
 // #define NUM_GPU_STREAMS 16
 // #define NUM_GPU_STREAMS 16
-#include "gpu_api_utils.h"
 #endif
 
 /* Various defininations     */
@@ -772,7 +772,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * gpublas_nb, sp_ienv_dist(8));
                                      //   get_max_buffer_size());
     /* array holding last column blk for each partition,
-       used in SchCompUdt--GPU.c         */
+       used in SchCompUdt-GPU.c         */
   #if 0
     int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64);
   #else
@@ -816,9 +816,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 	printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7));
         printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
                 max_row_size, max_ncols);
+        printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
+        fflush(stdout);
     }
-    printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size);
-    fflush(stdout);
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
@@ -831,17 +831,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
-            iam, bigv_size, buffer_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n",
+                iam, bigv_size, buffer_size);
+        fflush(stdout);
+    }
 #endif
-    if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,gpuHostMallocDefault)) )
+
+    if ( checkGPU(gpuHostMalloc((void**)&bigV, bigv_size * sizeof(doublecomplex), gpuHostMallocDefault)) )
         ABORT("Malloc fails for zgemm buffer V");
 
 #if ( PRNTlevel>=1 )
     if ( iam==0 ) {
         DisplayHeader();
-	printf(" Starting with %d GPU Streams \n",nstreams );
+	printf(" Starting with %d GPU Streams \n", nstreams);
         fflush(stdout);
     }
 #endif
@@ -873,14 +876,13 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     }
 
     // size of B should be bigu_size
-
     gpuStat = gpuMalloc((void**)&dB, bigu_size * sizeof(doublecomplex));
     if (gpuStat!= gpuSuccess) {
         fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(doublecomplex));
         return 1;
     }
 
-    gpuStat = gpuMalloc((void**)&dC, buffer_size* sizeof(doublecomplex) );
+    gpuStat = gpuMalloc((void**)&dC, buffer_size * sizeof(doublecomplex) );
     if (gpuStat!= gpuSuccess) {
         fprintf(stderr, "!!!! Error in allocating C in the device \n" );
         return 1;
@@ -898,8 +900,10 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
-    fflush(stdout);
+    if ( iam==0 ) {
+        printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
+        fflush(stdout);
+    }
 #endif
 
 //#ifdef __INTEL_COMPILER
@@ -1929,7 +1933,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     /* Prepare error message - find the smallest index i that U(i,i)==0 */
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
 
 #if ( PROFlevel>=1 )
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 7792dc5f..c4d9dc83 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -356,7 +356,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int iinfo;
     if ( *info == 0 ) *info = n + 1;
     MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
-    if ( iinfo == n + 1 ) *info = 0;
+    if ( iinfo == (n + 1) ) *info = 0;
     else *info = iinfo;
     //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
 
diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c
index 051d6d2f..f5ea9593 100644
--- a/SRC/pzgstrs.c
+++ b/SRC/pzgstrs.c
@@ -849,7 +849,7 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     zLocalLU_t *Llu = LUstruct->Llu;
     doublecomplex alpha = {1.0, 0.0};
-	doublecomplex beta = {0.0, 0.0};
+    doublecomplex beta = {0.0, 0.0};
     doublecomplex zero = {0.0, 0.0};
     doublecomplex *lsum;  /* Local running sum of the updates to B-components */
     doublecomplex *x;     /* X component at step k. */
@@ -896,31 +896,31 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
 
     double tmax;
     	/*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
+    int  *fmod;         /* Modification count for L-solve --
     			 Count the number of local block products to
     			 be summed into lsum[lk]. */
-    int_t fmod_tmp;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  nfrecvx_buf=0;
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-    			 from processes in this row.
-    			 It is only valid on the diagonal processes. */
-    int_t  frecv_tmp;
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nfrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nbrecv = 0; /* Count of total messages to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-    int_t  nleaftmp = 0, nroottmp = 0;
+    int  fmod_tmp;
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  nfrecvx_buf=0;
+    int  *frecv;        /* Count of lsum[lk] contributions to be received
+    			     from processes in this row.
+    			     It is only valid on the diagonal processes. */
+    int  frecv_tmp;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nfrecv = 0; /* Count of total messages to be recv'd. */
+    int  nbrecv = 0; /* Count of total messages to be recv'd. */
+    int  nleaf = 0, nroot = 0;
+    int  nleaftmp = 0, nroottmp = 0;
     int_t  msgsize;
         /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  bmod_tmp;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  nbrecvx_buf=0;
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-    			 processes in this row. */
+    int  *bmod;         /* Modification count for U-solve. */
+    int  bmod_tmp;
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  nbrecvx_buf=0;
+    int  *brecv;        /* Count of modifications to be recv'd from
+    			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     int_t flagx,flaglsum,flag;
     int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups;
@@ -932,7 +932,7 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
 
     int_t gik,iklrow,fnz;
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
     int INFO, pad;
     int_t tmpresult;
 
@@ -950,22 +950,25 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
     int iword = sizeof (int_t);
     int dword = sizeof (doublecomplex);
     int Nwork;
-	int_t procs = grid->nprow * grid->npcol;
-    	yes_no_t done;
+    int_t procs = grid->nprow * grid->npcol;
+    yes_no_t done;
     yes_no_t startforward;
-    	int nbrow;
+    int nbrow;
     int_t  ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc;
     int_t  lptr1_tmp, idx_i, idx_v,m;
-    	int_t ready;
-    	int thread_id = 0;
+    int_t ready;
+    int thread_id = 0;
     yes_no_t empty;
     int_t sizelsum,sizertemp,aln_d,aln_i;
-    aln_d = ceil(CACHELINE/(double)dword);
-    aln_i = ceil(CACHELINE/(double)iword);
+    aln_d = 1; //ceil(CACHELINE/(double)dword);
+    aln_i = 1; //ceil(CACHELINE/(double)iword);
     int num_thread = 1;
 
-	maxsuper = sp_ienv_dist(3);
+    maxsuper = sp_ienv_dist(3);
 
+//#ifdef _OPENMP
+//#pragma omp threadprivate(thread_id)
+//#endif
 
 #ifdef _OPENMP
 #pragma omp parallel default(shared)
@@ -1029,10 +1032,10 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS. */
-    if ( !(fmod = intMalloc_dist(nlb*aln_i)) )
+    if ( !(fmod = int32Malloc_dist(nlb*aln_i)) )
 	ABORT("Malloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i];
-    if ( !(frecv = intCalloc_dist(nlb)) )
+    if ( !(frecv = int32Calloc_dist(nlb)) )
 	ABORT("Calloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -1063,9 +1066,9 @@ pzgstrs(int_t n, zLUstruct_t *LUstruct,
 #ifdef _OPENMP
     if ( !(lsum = (doublecomplex*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(doublecomplex))))
 	ABORT("Malloc fails for lsum[].");
-#pragma omp parallel default(shared) private(ii)
+#pragma omp parallel default(shared) private(ii,thread_id)
     {
-	int thread_id = omp_get_thread_num(); //mjc
+	thread_id = omp_get_thread_num(); //mjc
 	for (ii=0; ii0)nfrecvx_buf++;
 				if(LBtree_ptr[lk].destCnt_>0)nfrecvx_buf++;
 			}
+			//BcTree_allocateRequest(LBtree_ptr[lk],'z');
 		}
 	}
 
@@ -1172,7 +1179,8 @@ if(procs==1){
 	for (lk=0;lkinv == 1) { /* Diagonal is inverted. */
 
 #ifdef _OPENMP
-#pragma	omp	for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
+#pragma	omp for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
 #endif
-			for (jj=0;jj=1 )
-					TIC(t1);
+			TIC(t1);
 #endif
-					rtemp_loc = &rtemp[sizertemp* thread_id];
-
-
-					knsupc = SuperSize( k );
-					lk = LBi( k, grid );
+			rtemp_loc = &rtemp[sizertemp* thread_id];
 
-					ii = X_BLK( lk );
-					lk = LBj( k, grid ); /* Local block number, column-wise. */
-					lsub = Lrowind_bc_ptr[lk];
-					lusup = Lnzval_bc_ptr[lk];
+			knsupc = SuperSize( k );
+			lk = LBi( k, grid );
 
-					nsupr = lsub[1];
+			ii = X_BLK( lk );
+			lk = LBj( k, grid ); /* Local block number, column-wise. */
+			lsub = Lrowind_bc_ptr[lk];
+			lusup = Lnzval_bc_ptr[lk];
 
-					Linv = Linv_bc_ptr[lk];
+			nsupr = lsub[1];
+			Linv = Linv_bc_ptr[lk];
 #ifdef _CRAY
-					CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
-							&alpha, Linv, &knsupc, &x[ii],
-							&knsupc, &beta, rtemp_loc, &knsupc );
+			CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+					&alpha, Linv, &knsupc, &x[ii],
+					&knsupc, &beta, rtemp_loc, &knsupc );
 #elif defined (USE_VENDOR_BLAS)
-					zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-							&alpha, Linv, &knsupc, &x[ii],
-							&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+			zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+					&alpha, Linv, &knsupc, &x[ii],
+					&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
 #else
-					zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-							&alpha, Linv, &knsupc, &x[ii],
-							&knsupc, &beta, rtemp_loc, &knsupc );
+			zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+					&alpha, Linv, &knsupc, &x[ii],
+					&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
 
+			for (i=0 ; i=1 )
-					TOC(t2, t1);
-					stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+			TOC(t2, t1);
+			stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 
 #endif
 
-					stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-					+ 10 * knsupc * nrhs; /* complex division */
-
-
-					// --nleaf;
+			stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+				+ 10 * knsupc * nrhs; /* complex division */
+			// --nleaf;
 #if ( DEBUGlevel>=2 )
-					printf("(%2d) Solve X[%2d]\n", iam, k);
+			printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
+			/*
+			 * Send Xk to process column Pc[k].
+			 */
 
-					/*
-					 * Send Xk to process column Pc[k].
-					 */
-
-					if(LBtree_ptr[lk].empty_==NO){
-						lib = LBi( k, grid ); /* Local block number, row-wise. */
-						ii = X_BLK( lib );
+			if(LBtree_ptr[lk].empty_==NO){
+				lib = LBi( k, grid ); /* Local block number, row-wise. */
+				ii = X_BLK( lib );
 
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
-						nleaf_send_tmp = ++nleaf_send;
-						leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
-						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
-					}
-				}
+				nleaf_send_tmp = ++nleaf_send;
+				leaf_send[(nleaf_send_tmp-1)*aln_i] = lk;
+				// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
 			}
+		    }
+	     }
 	} else { /* Diagonal is not inverted. */
 #ifdef _OPENMP
 #pragma	omp	for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait
@@ -1343,7 +1347,7 @@ thread_id=0;
 #if ( PROFlevel>=1 )
 		    TIC(t1);
 #endif
-			rtemp_loc = &rtemp[sizertemp* thread_id];
+		    rtemp_loc = &rtemp[sizertemp* thread_id];
 
 		    knsupc = SuperSize( k );
 		    lk = LBi( k, grid );
@@ -1396,71 +1400,64 @@ thread_id=0;
 		    }
 		    } /* end a block */
 		} /* end for jj ... */
-	    } /* end else ... diagonal is not invedted */
+	    } /* end else ... diagonal is not inverted */
 	  }
-	}
+	} /* end parallel region */
 
 	jj=0;
 
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
-	{
-#else
-	{
 #endif
-
+	{
 
 #ifdef _OPENMP
 #pragma omp master
 #endif
-				{
+	    {
 
 #ifdef _OPENMP
-#pragma	omp	taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
+#pragma	omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
 #endif
+		for (jj=0;jj=0){ // this is a bcast forwarding
-					gb = mycol+lk*grid->npcol;  /* not sure */
-					lib = LBi( gb, grid ); /* Local block number, row-wise. */
-					ii = X_BLK( lib );
-					// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-					C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
-								}else{ // this is a reduce forwarding
-					lk = -lk - 1;
-					il = LSUM_BLK( lk );
-					// RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
-					C_RdTree_forwardMessageSimple(&LRtree_ptr[lk],&lsum[il - LSUM_H ],LRtree_ptr[lk].msgSize_*nrhs+LSUM_H);
-				}
-			}
+	}
 
+	for (i=0;i<nleaf_send;i++){
+		lk = leaf_send[i*aln_i];
+		if(lk>=0){ // this is a bcast forwarding
+			gb = mycol+lk*grid->npcol;  /* not sure */
+			lib = LBi( gb, grid ); /* Local block number, row-wise. */
+			ii = X_BLK( lib );
+			//BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
+			C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
+			
+		}else{ // this is a reduce forwarding
+			lk = -lk - 1;
+			il = LSUM_BLK( lk );
+			//RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
+			C_RdTree_forwardMessageSimple(&LRtree_ptr[lk],&lsum[il - LSUM_H],LRtree_ptr[lk].msgSize_*nrhs+LSUM_H);
+			
+		}
+	}
 
 
 #ifdef USE_VTUNE
@@ -1468,413 +1465,392 @@ thread_id=0;
 	__SSC_MARK(0x222); // stop SDE tracing
 #endif
 
-			/* -----------------------------------------------------------
-			   Compute the internal nodes asynchronously by all processes.
-			   ----------------------------------------------------------- */
+	/* -----------------------------------------------------------
+	   Compute the internal nodes asynchronously by all processes.
+	   ----------------------------------------------------------- */
 
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
-			{
-	int thread_id = omp_get_thread_num();
+	{
+        int thread_id = omp_get_thread_num();
 #else
 	{
-	thread_id=0;
+	thread_id = 0;
 #endif
 #ifdef _OPENMP
 #pragma omp master
 #endif
-				{
-					for ( nfrecv =0; nfrecv=1 )
-						TIC(t1);
-						// msgcnt[1] = maxrecvsz;
+		   	TIC(t1);
+			// msgcnt[1] = maxrecvsz;
 #endif
 
-						recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz];
+			recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz];
 
-						/* Receive a message. */
-						MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-						// MPI_Irecv(recvbuf0,maxrecvsz,SuperLU_MPI_DOUBLE_COMPLEX,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req);
-						// ready=0;
-						// while(ready==0){
-						// MPI_Test(&req,&ready,&status);
-						// #pragma omp taskyield
-						// }
+			/* Receive a message. */
+			MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+				MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+			// MPI_Irecv(recvbuf0,maxrecvsz,SuperLU_MPI_DOUBLE_COMPLEX,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req);
+			// ready=0;
+			// while(ready==0){
+			// MPI_Test(&req,&ready,&status);
+			// #pragma omp taskyield
+			// }
 
 #if ( PROFlevel>=1 )
-						TOC(t2, t1);
-						stat_loc[thread_id]->utime[SOL_COMM] += t2;
+			TOC(t2, t1);
+			stat_loc[thread_id]->utime[SOL_COMM] += t2;
 
-						msg_cnt += 1;
-						msg_vol += maxrecvsz * dword;
+			msg_cnt += 1;
+			msg_vol += maxrecvsz * dword;
 #endif
 
-						{
+			{
 
-							k = (*recvbuf0).r;
+			k = (*recvbuf0).r;
 
 #if ( DEBUGlevel>=2 )
-							printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-
-							if(status.MPI_TAG==BC_L){
-								// --nfrecvx;
-								nfrecvx_buf++;
-								{
-									lk = LBj( k, grid );    /* local block number */
-
-									if(LBtree_ptr[lk].destCnt_>0){
-
-										// BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-										C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], recvbuf0, LBtree_ptr[lk].msgSize_*nrhs+XK_H);
-										// nfrecvx_buf++;
-									}
-
-									/*
-									 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-									 */
-
-									lk = LBj( k, grid ); /* Local block number, column-wise. */
-									lsub = Lrowind_bc_ptr[lk];
-									lusup = Lnzval_bc_ptr[lk];
-									if ( lsub ) {
-										krow = PROW( k, grid );
-										if(myrow==krow){
-											nb = lsub[0] - 1;
-											knsupc = SuperSize( k );
-											ii = X_BLK( LBi( k, grid ) );
-											xin = &x[ii];
-										}else{
-											nb   = lsub[0];
-											knsupc = SuperSize( k );
-											xin = &recvbuf0[XK_H] ;
-										}
-
-										zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
-												fmod, nb, xsup, grid, Llu,
-												stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
-
-									} /* if lsub */
-								}
-
-							}else if(status.MPI_TAG==RD_L){
-								// --nfrecvmod;
-								lk = LBi( k, grid ); /* Local block number, row-wise. */
-
-								knsupc = SuperSize( k );
-								tempv = &recvbuf0[LSUM_H];
-								il = LSUM_BLK( lk );
-								RHS_ITERATE(j) {
-									for (i = 0; i < knsupc; ++i)
-										z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
-											  &lsum[i + il + j*knsupc + thread_id*sizelsum],
-											  &tempv[i + j*knsupc]);
-
-								}
-
-								// #ifdef _OPENMP
-								// #pragma omp atomic capture
-								// #endif
-								fmod_tmp=--fmod[lk*aln_i];
-								{
-									thread_id = 0;
-									rtemp_loc = &rtemp[sizertemp* thread_id];
-									if ( fmod_tmp==0 ) {
-										if(C_RdTree_IsRoot(&LRtree_ptr[lk])==YES){
-											// ii = X_BLK( lk );
-											knsupc = SuperSize( k );
-											for (ii=1;ii0){
+
+					//BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
+					C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], recvbuf0, LBtree_ptr[lk].msgSize_*nrhs+XK_H);
+					// nfrecvx_buf++;
+				}
+
+				/*
+				 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+				 */
+
+				lk = LBj( k, grid ); /* Local block number, column-wise. */
+				lsub = Lrowind_bc_ptr[lk];
+				lusup = Lnzval_bc_ptr[lk];
+				if ( lsub ) {
+					krow = PROW( k, grid );
+					if(myrow==krow){
+					    nb = lsub[0] - 1;
+					    knsupc = SuperSize( k );
+					    ii = X_BLK( LBi( k, grid ) );
+					    xin = &x[ii];
+				        }else{
+					    nb   = lsub[0];
+					    knsupc = SuperSize( k );
+					    xin = &recvbuf0[XK_H] ;
+					}
+					zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, 
+					    fmod, nb, xsup, grid, Llu, 
+					    stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
+
+				} /* if lsub */
+			    }
+			}else if(status.MPI_TAG==RD_L){
+				// --nfrecvmod;
+				lk = LBi( k, grid ); /* Local block number, row-wise. */
+
+				knsupc = SuperSize( k );
+				tempv = &recvbuf0[LSUM_H];
+				il = LSUM_BLK( lk );
+				RHS_ITERATE(j) {
+				for (i = 0; i < knsupc; ++i)
+					z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum], &lsum[i + il + j*knsupc + thread_id*sizelsum], &tempv[i + j*knsupc]);
+				} 
+
+			// #ifdef _OPENMP
+			// #pragma omp atomic capture
+			// #endif
+				fmod_tmp=--fmod[lk*aln_i];
+				{
+				thread_id = 0;
+				rtemp_loc = &rtemp[sizertemp* thread_id];
+				if ( fmod_tmp==0 ) {
+				    if(C_RdTree_IsRoot(&LRtree_ptr[lk])==YES){
+				    // ii = X_BLK( lk );
+					knsupc = SuperSize( k );
+					for (ii=1;ii=1 )
-											TIC(t1);
+					TIC(t1);
 #endif
-
-											if(Llu->inv == 1){
-												Linv = Linv_bc_ptr[lk];
+					if(Llu->inv == 1){
+						Linv = Linv_bc_ptr[lk];
 #ifdef _CRAY
-												CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
-														&alpha, Linv, &knsupc, &x[ii],
-														&knsupc, &beta, rtemp_loc, &knsupc );
+						CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+						&alpha, Linv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc );
 #elif defined (USE_VENDOR_BLAS)
-												zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-														&alpha, Linv, &knsupc, &x[ii],
-														&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+						zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+							&alpha, Linv, &knsupc, &x[ii],
+							&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
 #else
-												zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-														&alpha, Linv, &knsupc, &x[ii],
-														&knsupc, &beta, rtemp_loc, &knsupc );
+						zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+							&alpha, Linv, &knsupc, &x[ii],
+							&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
-
-												for (i=0 ; iinnv == 0 */
 #ifdef _CRAY
-												CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-														lusup, &nsupr, &x[ii], &knsupc);
+					    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
+					    lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
-														lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+					    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+					      lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-												ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
-														lusup, &nsupr, &x[ii], &knsupc);
+					    ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+					        lusup, &nsupr, &x[ii], &knsupc);
 #endif
-											}
+					} /* end if-else */
 
 #if ( PROFlevel>=1 )
-											TOC(t2, t1);
-											stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+					TOC(t2, t1);
+					stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 #endif
 
-											stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-											+ 10 * knsupc * nrhs; /* complex division */
+					stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+					+ 10 * knsupc * nrhs; /* complex division */
 
 #if ( DEBUGlevel>=2 )
-											printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-
-											/*
-											 * Send Xk to process column Pc[k].
-											 */
-											if(LBtree_ptr[lk].empty_==NO){
-												// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-												C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
-											}
-
-
-											/*
-											 * Perform local block modifications.
-											 */
-											lk = LBj( k, grid ); /* Local block number, column-wise. */
-											lsub = Lrowind_bc_ptr[lk];
-											lusup = Lnzval_bc_ptr[lk];
-											if ( lsub ) {
-												krow = PROW( k, grid );
-												nb = lsub[0] - 1;
-												knsupc = SuperSize( k );
-												ii = X_BLK( LBi( k, grid ) );
-												xin = &x[ii];
-												zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
-														fmod, nb, xsup, grid, Llu,
-														stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread);
-											} /* if lsub */
-											// }
-
-									}else{
-
-										il = LSUM_BLK( lk );
-										knsupc = SuperSize( k );
-
-										for (ii=1;ii=2 )
-		t = SuperLU_timer_() - t;
-		stat->utime[SOL_TOT] += t;
-		if ( !iam ) {
-			printf(".. L-solve time\t%8.4f\n", t);
-			fflush(stdout);
-		}
+				    }else{ /* fmod_tmp != 0 */
+					il = LSUM_BLK( lk );
+					knsupc = SuperSize( k );
+					for (ii=1;iicomm);
-		if ( !iam ) {
-			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);
-			fflush(stdout);
-		}
+				} /* end else MPI_TAG==RD_L */
+			    } /* check Tag */
+			} /* end for nfrecv ... */
+                    } /* while not finished ... */
+       	    }
+        } // end of parallel
 
+#if ( PRNTlevel>=2 )
+	t = SuperLU_timer_() - t;
+	stat->utime[SOL_TOT] += t;
+	if ( !iam ) {
+		printf(".. L-solve time\t%8.4f\n", t);
+		fflush(stdout);
+	}
 
-		t = SuperLU_timer_();
+	MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm);
+	if ( !iam ) {
+		printf(".. L-solve time (MAX) \t%8.4f\n", tmax);
+		fflush(stdout);
+	}
+	t = SuperLU_timer_();
 #endif
 
-
 #if ( DEBUGlevel==2 )
-		{
-			printf("(%d) .. After L-solve: y =\n", iam);
-			for (i = 0, k = 0; k < nsupers; ++k) {
-				krow = PROW( k, grid );
-				kcol = PCOL( k, grid );
-				if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-					knsupc = SuperSize( k );
-					lk = LBi( k, grid );
-					ii = X_BLK( lk );
-					for (j = 0; j < knsupc; ++j)
-						printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+	{
+		printf("(%d) .. After L-solve: y =\n", iam);
+		for (i = 0, k = 0; k < nsupers; ++k) {
+			krow = PROW( k, grid );
+			kcol = PCOL( k, grid );
+			if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+				knsupc = SuperSize( k );
+				lk = LBi( k, grid );
+				ii = X_BLK( lk );
+				for (j = 0; j < knsupc; ++j)
+					printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
 					fflush(stdout);
 				}
-				MPI_Barrier( grid->comm );
-			}
+			MPI_Barrier( grid->comm );
 		}
+	}
 #endif
 
-		SUPERLU_FREE(fmod);
-		SUPERLU_FREE(frecv);
-		SUPERLU_FREE(leaf_send);
-		SUPERLU_FREE(leafsups);
-		SUPERLU_FREE(recvbuf_BC_fwd);
-		log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword*2.0, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd
+	SUPERLU_FREE(fmod);
+	SUPERLU_FREE(frecv);
+	SUPERLU_FREE(leaf_send);
+	SUPERLU_FREE(leafsups);
+	SUPERLU_FREE(recvbuf_BC_fwd);
+	log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword*2.0, stat);	//account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd
 
-		for (lk=0;lkcomm );
+	}
+	MPI_Barrier( grid->comm );
 
 #if ( VAMPIR>=1 )
-		VT_traceoff();
-		VT_finalize();
+	VT_traceoff();
+	VT_finalize();
 #endif
 
 
-		/*---------------------------------------------------
-		 * Back solve Ux = y.
-		 *
-		 * The Y components from the forward solve is already
-		 * on the diagonal processes.
+	/*---------------------------------------------------
+	 * Back solve Ux = y.
+	 *
+	 * The Y components from the forward solve are already
+	 * on the diagonal processes.
 	 *---------------------------------------------------*/
 
+	/* Save the count to be altered so it can be used by
+	   subsequent call to PDGSTRS. */
+	if ( !(bmod = int32Malloc_dist(nlb*aln_i)) )
+		ABORT("Malloc fails for bmod[].");
+	for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i];
+	if ( !(brecv = int32Calloc_dist(nlb)) )
+		ABORT("Calloc fails for brecv[].");
+	Llu->brecv = brecv;
 
-		/* Save the count to be altered so it can be used by
-		   subsequent call to PDGSTRS. */
-		if ( !(bmod = intMalloc_dist(nlb*aln_i)) )
-			ABORT("Malloc fails for bmod[].");
-		for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i];
-		if ( !(brecv = intCalloc_dist(nlb)) )
-			ABORT("Calloc fails for brecv[].");
-		Llu->brecv = brecv;
+	k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
 
-		k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-
-		/* Re-initialize lsum to zero. Each block header is already in place. */
+	/* Re-initialize lsum to zero. Each block header is already in place. */
 
 #ifdef _OPENMP
 
-#pragma omp parallel default(shared) private(ii,thread_id)
+#pragma omp parallel default(shared) private(ii)
 	{
-		int thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 		for(ii=0;ii=2 )
-		for (p = 0; p < Pr*Pc; ++p) {
-			if (iam == p) {
-				printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-				for (lb = 0; lb < nub; ++lb) {
-					printf("(%2d) Local col %2d: # row blocks %2d\n",
-							iam, lb, Urbs[lb]);
-					if ( Urbs[lb] ) {
-						for (i = 0; i < Urbs[lb]; ++i)
-							printf("(%2d) .. row blk %2d:\
-									lbnum %d, indpos %d, valpos %d\n",
-									iam, i,
-									Ucb_indptr[lb][i].lbnum,
-									Ucb_indptr[lb][i].indpos,
-									Ucb_valptr[lb][i]);
-					}
-				}
-			}
-			MPI_Barrier( grid->comm );
+	for (p = 0; p < Pr*Pc; ++p) {
+	    if (iam == p) {
+		printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+		for (lb = 0; lb < nub; ++lb) {
+		    printf("(%2d) Local col %2d: # row blocks %2d\n",
+				iam, lb, Urbs[lb]);
+		    if ( Urbs[lb] ) {
+			for (i = 0; i < Urbs[lb]; ++i)
+			    printf("(%2d) .. row blk %2d:\
+				    lbnum %d, indpos %d, valpos %d\n",
+				    iam, i,
+				    Ucb_indptr[lb][i].lbnum,
+				    Ucb_indptr[lb][i].indpos,
+				    Ucb_valptr[lb][i]);
+		     }
 		}
-		for (p = 0; p < Pr*Pc; ++p) {
-			if ( iam == p ) {
-				printf("\n(%d) bsendx_plist[][]", iam);
-				for (lb = 0; lb < nub; ++lb) {
-					printf("\n(%d) .. local col %2d: ", iam, lb);
-					for (i = 0; i < Pr; ++i)
-						printf("%4d", bsendx_plist[lb][i]);
-				}
-				printf("\n");
+	    }
+	    MPI_Barrier( grid->comm );
+	}
+	for (p = 0; p < Pr*Pc; ++p) {
+		if ( iam == p ) {
+			printf("\n(%d) bsendx_plist[][]", iam);
+			for (lb = 0; lb < nub; ++lb) {
+				printf("\n(%d) .. local col %2d: ", iam, lb);
+				for (i = 0; i < Pr; ++i)
+					printf("%4d", bsendx_plist[lb][i]);
 			}
-			MPI_Barrier( grid->comm );
+			printf("\n");
 		}
+		MPI_Barrier( grid->comm );
+	}
 #endif /* DEBUGlevel */
 
-
-
-
 	/* ---------------------------------------------------------
 	   Initialize the async Bcast trees on all processes.
 	   --------------------------------------------------------- */
@@ -1882,14 +1858,14 @@ thread_id=0;
 
 	nbtree = 0;
 	for (lk=0;lk0)nbrecvx_buf++;
-			}
-			// BcTree_allocateRequest(UBtree_ptr[lk],'d');
+	    if(UBtree_ptr[lk].empty_==NO){
+		// printf("UBtree_ptr lk %5d\n",lk);
+		if(C_BcTree_IsRoot(&UBtree_ptr[lk])==NO){
+			nbtree++;
+			if(UBtree_ptr[lk].destCnt_>0)nbrecvx_buf++;
 		}
+		//BcTree_allocateRequest(UBtree_ptr[lk],'z');
+	    }
 	}
 
 	nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
@@ -1903,7 +1879,7 @@ thread_id=0;
 			// printf("here lk %5d myid %5d\n",lk,iam);
 			// fflush(stdout);
 			nrtree++;
-			// RdTree_allocateRequest(URtree_ptr[lk],'z');
+			//RdTree_allocateRequest(URtree_ptr[lk],'z');
 			brecv[lk] = URtree_ptr[lk].destCnt_;
 			nbrecvmod += brecv[lk];
 		}else{
@@ -1923,7 +1899,6 @@ thread_id=0;
 	for (i = 0; i < nlb; ++i) bmod[i*aln_i] += brecv[i];
 	// for (i = 0; i < nlb; ++i)printf("bmod[i]: %5d\n",bmod[i]);
 
-
 	if ( !(recvbuf_BC_fwd = (doublecomplex*)SUPERLU_MALLOC(maxrecvsz*(nbrecvx+1) * sizeof(doublecomplex))) )  // this needs to be optimized for 1D row mapping
 		ABORT("Malloc fails for recvbuf_BC_fwd[].");
 	nbrecvx_buf=0;
@@ -1945,45 +1920,37 @@ thread_id=0;
 	t = SuperLU_timer_();
 #endif
 
-		/*
-		 * Solve the roots first by all the diagonal processes.
-		 */
+	/*
+	 * Solve the roots first by all the diagonal processes.
+	 */
 #if ( DEBUGlevel>=2 )
-		printf("(%2d) nroot %4d\n", iam, nroot);
-		fflush(stdout);
+	printf("(%2d) nroot %4d\n", iam, nroot);
+	fflush(stdout);
 #endif
 
-
-
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
-	{
-#else
-	{
 #endif
+	{
 #ifdef _OPENMP
 #pragma omp master
 #endif
-		{
+	    {
 #ifdef _OPENMP
-#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp,thread_id) nogroup
+#pragma	omp taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp,thread_id) nogroup
 #endif
 		for (jj=0;jj<nroot;jj++){
 			k=rootsups[jj];
 
 #if ( PROFlevel>=1 )
 			TIC(t1);
 #endif
 #ifdef _OPENMP
-			thread_id=omp_get_thread_num();
+			thread_id = omp_get_thread_num ();
 #else
-			thread_id=0;
+			thread_id = 0;
 #endif
-
 			rtemp_loc = &rtemp[sizertemp* thread_id];
 
-
-
 			knsupc = SuperSize( k );
 			lk = LBi( k, grid ); /* Local block number, row-wise. */
 
@@ -1994,7 +1961,6 @@ thread_id=0;
 			lusup = Lnzval_bc_ptr[lk];
 			nsupr = lsub[1];
 
-
 			if(Llu->inv == 1){
 
 				Uinv = Uinv_bc_ptr[lk];
@@ -2011,7 +1977,6 @@ thread_id=0;
 						&alpha, Uinv, &knsupc, &x[ii],
 						&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
-
 				for (i=0 ; inpcol;  /* not sure */
 		lib = LBi( gb, grid ); /* Local block number, row-wise. */
 		ii = X_BLK( lib );
-		// BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
+		//BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 		C_BcTree_forwardMessageSimple(&UBtree_ptr[lk], &x[ii - XK_H], UBtree_ptr[lk].msgSize_*nrhs+XK_H);
 	}else{ // this is a reduce forwarding
 		lk = -lk - 1;
 		il = LSUM_BLK( lk );
-		// RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
+		//RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'z')*nrhs+LSUM_H,'z');
 		C_RdTree_forwardMessageSimple(&URtree_ptr[lk],&lsum[il - LSUM_H ],URtree_ptr[lk].msgSize_*nrhs+LSUM_H);
 	}
 }
 
-
-		/*
-		 * Compute the internal nodes asychronously by all processes.
-		 */
+	/*
+	 * Compute the internal nodes asynchronously by all processes.
+	 */
 
 #ifdef _OPENMP
 #pragma omp parallel default (shared)
 	{
-	int thread_id=omp_get_thread_num();
-#else
-	{
-	thread_id=0;
+	    int thread_id=omp_get_thread_num();
+#else 
+        {
+	    thread_id = 0;
 #endif
 #ifdef _OPENMP
 #pragma omp master
 #endif
-		for ( nbrecv =0; nbrecv=1 )
 			TIC(t1);
 #endif
 
-			recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz];
+		recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz];
 
-			/* Receive a message. */
-			MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+		/* Receive a message. */
+		MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+			MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
 
 #if ( PROFlevel>=1 )
 			TOC(t2, t1);
 			stat_loc[thread_id]->utime[SOL_COMM] += t2;
-
 			msg_cnt += 1;
 			msg_vol += maxrecvsz * dword;
 #endif
 
-			k = (*recvbuf0).r;
+		k = (*recvbuf0).r;
 #if ( DEBUGlevel>=2 )
-			printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-			fflush(stdout);
+		printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+		fflush(stdout);
 #endif
+		if(status.MPI_TAG==BC_U){
+		    // --nfrecvx;
+		    nbrecvx_buf++;
+		    lk = LBj( k, grid );    /* local block number */
+		    if(UBtree_ptr[lk].destCnt_>0){
 
-			if(status.MPI_TAG==BC_U){
-				// --nfrecvx;
-				nbrecvx_buf++;
-
-				lk = LBj( k, grid );    /* local block number */
-
-				if(UBtree_ptr[lk].destCnt_>0){
-
-					// BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-					C_BcTree_forwardMessageSimple(&UBtree_ptr[lk], recvbuf0, UBtree_ptr[lk].msgSize_*nrhs+XK_H);
-					// nfrecvx_buf++;
-				}
-
-				/*
-				 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-				 */
-
-				lk = LBj( k, grid ); /* Local block number, column-wise. */
-				zlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,
-						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-						stat_loc, sizelsum,sizertemp,thread_id,num_thread);
-			}else if(status.MPI_TAG==RD_U){
-
-				lk = LBi( k, grid ); /* Local block number, row-wise. */
-
-				knsupc = SuperSize( k );
-				tempv = &recvbuf0[LSUM_H];
-				il = LSUM_BLK( lk );
-				RHS_ITERATE(j) {
-
-					for (i = 0; i < knsupc; ++i)
-						z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
-							  &lsum[i + il + j*knsupc + thread_id*sizelsum],
-							  &tempv[i + j*knsupc]);
-
-				}
-			// #ifdef _OPENMP
-			// #pragma omp atomic capture
-			// #endif
-				bmod_tmp=--bmod[lk*aln_i];
-				thread_id = 0;
-				rtemp_loc = &rtemp[sizertemp* thread_id];
-				if ( bmod_tmp==0 ) {
-					if(C_RdTree_IsRoot(&URtree_ptr[lk])==YES){
-
-						knsupc = SuperSize( k );
-						for (ii=1;iiinv == 1){
+		    knsupc = SuperSize( k );
+		    tempv = &recvbuf0[LSUM_H];
+		    il = LSUM_BLK( lk );
+		    RHS_ITERATE(j) {
+			for (i = 0; i < knsupc; ++i)
+			    z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
+				  &lsum[i + il + j*knsupc + thread_id*sizelsum],
+				  &tempv[i + j*knsupc]);
+		    }
+		// #ifdef _OPENMP
+		// #pragma omp atomic capture
+		// #endif
+		    bmod_tmp=--bmod[lk*aln_i];
+		    thread_id = 0;
+		    rtemp_loc = &rtemp[sizertemp* thread_id];
+		    if ( bmod_tmp==0 ) {
+			//if(RdTree_IsRoot(URtree_ptr[lk],'z')==YES){
+			if(C_RdTree_IsRoot(&URtree_ptr[lk])==YES){
+
+			    knsupc = SuperSize( k );
+			    for (ii=1;iiinv == 1){
 
-							Uinv = Uinv_bc_ptr[lk];
+				Uinv = Uinv_bc_ptr[lk];
 
 #ifdef _CRAY
-							CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
-									&alpha, Uinv, &knsupc, &x[ii],
-									&knsupc, &beta, rtemp_loc, &knsupc );
+				CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc );
 #elif defined (USE_VENDOR_BLAS)
-							zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-									&alpha, Uinv, &knsupc, &x[ii],
-									&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+				zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
 #else
-							zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
-									&alpha, Uinv, &knsupc, &x[ii],
-									&knsupc, &beta, rtemp_loc, &knsupc );
+				zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc );
 #endif
 
-
-							for (i=0 ; iinv == 0 */
 #ifdef _CRAY
-							CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-									lusup, &nsupr, &x[ii], &knsupc);
+				CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
+						lusup, &nsupr, &x[ii], &knsupc);
 #elif defined (USE_VENDOR_BLAS)
-							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
-									lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+					lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
 #else
-							ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
-									lusup, &nsupr, &x[ii], &knsupc);
+				ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
+					lusup, &nsupr, &x[ii], &knsupc);
 #endif
-						}
+			    }
 
 #if ( PROFlevel>=1 )
-							TOC(t2, t1);
-							stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+			    TOC(t2, t1);
+			    stat_loc[thread_id]->utime[SOL_TRSM] += t2;
 #endif
-							stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-							+ 10 * knsupc * nrhs; /* complex division */
+			    stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+					+ 10 * knsupc * nrhs; /* complex division */
 
 #if ( DEBUGlevel>=2 )
-						printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-
-						/*
-						 * Send Xk to process column Pc[k].
-						 */
-						if(UBtree_ptr[lk].empty_==NO){
-							// BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'d')*nrhs+XK_H,'d');
-							C_BcTree_forwardMessageSimple(&UBtree_ptr[lk], &x[ii - XK_H], UBtree_ptr[lk].msgSize_*nrhs+XK_H);
-						}
-
-
-						/*
-						 * Perform local block modifications:
-						 *         lsum[i] -= U_i,k * X[k]
-						 */
-						if ( Urbs[lk] )
-							zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,
-									Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-									stat_loc, sizelsum,sizertemp,thread_id,num_thread);
-
-					}else{
-						il = LSUM_BLK( lk );
-						knsupc = SuperSize( k );
-
-						for (ii=1;ii=2 )
-		t = SuperLU_timer_() - t;
-		stat->utime[SOL_TOT] += t;
-		if ( !iam ) printf(".. U-solve time\t%8.4f\n", t);
-		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
-				MPI_MAX, 0, grid->comm);
-		if ( !iam ) {
-			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);
-			fflush(stdout);
-		}
-		t = SuperLU_timer_();
+	t = SuperLU_timer_() - t;
+	stat->utime[SOL_TOT] += t;
+	if ( !iam ) printf(".. U-solve time\t%8.4f\n", t);
+	MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm);
+	if ( !iam ) {
+		printf(".. U-solve time (MAX) \t%8.4f\n", tmax);
+		fflush(stdout);
+	}
+	t = SuperLU_timer_();
 #endif
 
-
 #if ( DEBUGlevel>=2 )
-		{
-			double *x_col;
-			int diag;
-			printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
-			ii = 0;
-			for (k = 0; k < nsupers; ++k) {
-				knsupc = SuperSize( k );
-				krow = PROW( k, grid );
-				kcol = PCOL( k, grid );
-				diag = PNUM( krow, kcol, grid);
-				if ( iam == diag ) { /* Diagonal process. */
-					lk = LBi( k, grid );
-					jj = X_BLK( lk );
-					x_col = &x[jj];
-					RHS_ITERATE(j) {
-						for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
-							printf("\t(%d)\t%4d\t%.10f\n",
-									iam, xsup[k]+i, x_col[i]);
-						}
-						x_col += knsupc;
-					}
-				}
-				ii += knsupc;
-			} /* for k ... */
+        {
+	    doublecomplex *x_col;
+	    int diag;
+	    printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
+	    ii = 0;
+	    for (k = 0; k < nsupers; ++k) {
+	    	knsupc = SuperSize( k );
+		krow = PROW( k, grid );
+		kcol = PCOL( k, grid );
+		diag = PNUM( krow, kcol, grid);
+		if ( iam == diag ) { /* Diagonal process. */
+		   lk = LBi( k, grid );
+		   jj = X_BLK( lk );
+		   x_col = &x[jj];
+		   RHS_ITERATE(j) {
+		       for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
+			   printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+i, x_col[i]);
+		       }      
+		       x_col += knsupc;
+		   }
 		}
+		ii += knsupc;
+	    } /* for k ... */
+	}
 #endif
 
-		pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
+	pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
 				ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
-
 #if ( PRNTlevel>=2 )
-		t = SuperLU_timer_() - t;
-		if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t);
-		t = SuperLU_timer_();
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t);
+	t = SuperLU_timer_();
 #endif
 
-
-		double tmp1=0;
-		double tmp2=0;
-		double tmp3=0;
-		double tmp4=0;
-		for(i=0;i<num_thread;i++){
-			tmp1 = SUPERLU_MAX(tmp1,stat_loc[i]->utime[SOL_TRSM]);
-			tmp2 = SUPERLU_MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]);
-			tmp3 = SUPERLU_MAX(tmp3,stat_loc[i]->utime[SOL_COMM]);
-			tmp4 += stat_loc[i]->ops[SOLVE];
+	double tmp1=0;
+	double tmp2=0;
+	double tmp3=0;
+	double tmp4=0;
+	for(i=0;i<num_thread;i++){
+		tmp1 = SUPERLU_MAX(tmp1,stat_loc[i]->utime[SOL_TRSM]);
+		tmp2 = SUPERLU_MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]);
+		tmp3 = SUPERLU_MAX(tmp3,stat_loc[i]->utime[SOL_COMM]);
+		tmp4 += stat_loc[i]->ops[SOLVE];
 #if ( PRNTlevel>=2 )
-			if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
+		if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
 #endif
-		}
-
-
-		stat->utime[SOL_TRSM] += tmp1;
-		stat->utime[SOL_GEMM] += tmp2;
-		stat->utime[SOL_COMM] += tmp3;
-		stat->ops[SOLVE]+= tmp4;
-
+	}
 
-		/* Deallocate storage. */
-		for(i=0;iutime[SOL_TRSM] += tmp1;
+	stat->utime[SOL_GEMM] += tmp2;
+	stat->utime[SOL_COMM] += tmp3;
+	stat->ops[SOLVE]+= tmp4;
 
+	/* Deallocate storage. */
+	for(i=0;icomm );
+	}
+	MPI_Barrier( grid->comm );
+
 
-		//		if (!iam) { printf("DBG: pzgstrs: after Barrier\n"); fflush(stdout);}
 #if ( PROFlevel>=2 )
-		{
-			float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
-
-			MPI_Reduce (&msg_cnt, &msg_cnt_sum,
-					1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
-			MPI_Reduce (&msg_cnt, &msg_cnt_max,
-					1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
-			MPI_Reduce (&msg_vol, &msg_vol_sum,
-					1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
-			MPI_Reduce (&msg_vol, &msg_vol_max,
-					1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
-			if (!iam) {
-				printf ("\tPDGSTRS comm stat:"
-						"\tAvg\tMax\t\tAvg\tMax\n"
-						"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
-						msg_cnt_sum / Pr / Pc, msg_cnt_max,
-						msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
-			}
+	{
+		float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+		MPI_Reduce (&msg_cnt, &msg_cnt_sum,
+				1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+		MPI_Reduce (&msg_cnt, &msg_cnt_max,
+				1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+		MPI_Reduce (&msg_vol, &msg_vol_sum,
+				1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+		MPI_Reduce (&msg_vol, &msg_vol_max,
+				1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+		if (!iam) {
+			printf ("\tPDGSTRS comm stat:"
+				"\tAvg\tMax\t\tAvg\tMax\n"
+				"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+				msg_cnt_sum / Pr / Pc, msg_cnt_max,
+				msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
 		}
+	}
 #endif
 
     stat->utime[SOLVE] = SuperLU_timer_() - t1_sol;
@@ -2445,31 +2386,27 @@ for (i=0;i=2 )
-	    float for_lu, total, max, avg, temp;
-		superlu_dist_mem_usage_t num_mem_usage;
-
-	    zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
-	    temp = num_mem_usage.total;
-
-	    MPI_Reduce( &temp, &max,
-		       1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
-	    MPI_Reduce( &temp, &avg,
-		       1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
-            if (!iam) {
-		printf("\n** Memory Usage **********************************\n");
-                printf("** Total highmark (MB):\n"
-		       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
-		       avg * 1e-6,
-		       avg / grid->nprow / grid->npcol * 1e-6,
-		       max * 1e-6);
-		printf("**************************************************\n");
-		fflush(stdout);
-            }
+    float for_lu, total, max, avg, temp;
+    superlu_dist_mem_usage_t num_mem_usage;
+
+    zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+    temp = num_mem_usage.total;
+
+    MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
+    MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+    if (!iam) {
+	printf("\n** Memory Usage **********************************\n");
+        printf("** Total highmark (MB):\n"
+	       "    Sum-of-all : %8.2f | Avg : %8.2f  | Max : %8.2f\n",
+	       avg * 1e-6,
+	       avg / grid->nprow / grid->npcol * 1e-6,
+	       max * 1e-6);
+	printf("**************************************************\n");
+	fflush(stdout);
+    }
 #endif
 
-
     return;
 } /* PZGSTRS */
 
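In pzgstrs above and in pzgstrs1/pzgstrs_Bglobal/pzgstrs_lsum below, the solve-phase counters (fmod, frecv, bmod, brecv, mod_bit) become plain int so they can be reduced with MPI_INT regardless of whether int_t is 32- or 64-bit. A minimal sketch of the resulting allocate/copy/reduce pattern, assuming int32Malloc_dist and int32Calloc_dist are the int-typed allocators used in these hunks:

    int i, *fmod, *frecv;
    if ( !(fmod  = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for fmod[].");
    if ( !(frecv = int32Calloc_dist(nlb)) ) ABORT("Calloc fails for frecv[].");
    for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; /* copy the persistent counts */
    /* every process receives the count, but only diagonal processes use it */
    MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
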
diff --git a/SRC/pzgstrs1.c b/SRC/pzgstrs1.c
index 05b57b2c..dafb741c 100644
--- a/SRC/pzgstrs1.c
+++ b/SRC/pzgstrs1.c
@@ -109,7 +109,8 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
     Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
     int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
     int    iam, kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, k, lb, ljb, lk, lptr, luptr;
+    int    i, ii, j, k, lb, ljb, lk;
+    int_t  il, lptr, luptr;
     int_t  nb, nlb, nub, nsupers;
     int_t  *xsup, *lsub, *usub;
     int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
@@ -125,19 +126,19 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 #endif
 
     /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for L-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -145,7 +146,7 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -179,10 +180,10 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PZGSTRS1. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -250,11 +251,12 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
 	    }
 	}
-	/*PrintInt10("mod_bit", nlb, mod_bit);*/
+	/*PrintInt32("mod_bit", nlb, mod_bit);*/
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	//MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -530,10 +532,10 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PZGSTRS1. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -557,7 +559,11 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -585,8 +591,13 @@ void pzgstrs1(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 		if ( mycol != kcol && bmod[lk] )
 		    i = 1;  /* Contribution from non-diagonal process. */
 		else i = 0;
+#if 0 // Sherry		
 		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
 			   MPI_SUM, kcol, scp->comm );
+#else			   
+		MPI_Reduce( &i, &brecv[lk], 1, MPI_INT, MPI_SUM, kcol, scp->comm );
+#endif
+
 		if ( mycol == kcol ) { /* Diagonal process. */
 		    nbrecvmod += brecv[lk];
 		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
diff --git a/SRC/pzgstrs_Bglobal.c b/SRC/pzgstrs_Bglobal.c
index 1d50615f..4a940ca3 100644
--- a/SRC/pzgstrs_Bglobal.c
+++ b/SRC/pzgstrs_Bglobal.c
@@ -134,19 +134,19 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 #endif
 
     /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of modifications to be recv'd from
+    int  *fmod;         /* Modification count for L-solve. */
+    int  **fsendx_plist = Llu->fsendx_plist;
+    int  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+    int  *frecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
+    int  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+    int  nleaf = 0, nroot = 0;
 
     /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for L-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
+    int  *bmod;         /* Modification count for L-solve. */
+    int  **bsendx_plist = Llu->bsendx_plist;
+    int  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+    int  *brecv;        /* Count of modifications to be recv'd from
 			     processes in this row. */
     int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
     double t;
@@ -154,7 +154,7 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
     int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+    int *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
 
     t = SuperLU_timer_();
 
@@ -189,10 +189,10 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(fmod = intMalloc_dist(nlb)) )
+    if ( !(fmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
@@ -277,7 +277,11 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif	
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
@@ -570,10 +574,10 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
     /* Save the count to be altered so it can be used by
        subsequent call to PDGSTRS_BGLOBAL. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
+    if ( !(bmod = int32Malloc_dist(nlb)) )
 	ABORT("Calloc fails for bmod[].");
     for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
+    if ( !(brecv = int32Malloc_dist(nlb)) )
 	ABORT("Malloc fails for brecv[].");
     Llu->brecv = brecv;
 
@@ -597,7 +601,11 @@ pzgstrs_Bglobal(int_t n, zLUstruct_t *LUstruct, gridinfo_t *grid,
 
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
+#if 0	   
 	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+#else	
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, scp->comm );
+#endif
 
 	for (k = 0; k < nsupers; ++k) {
 	    krow = PROW( k, grid );
diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c
index 4e2e52a9..fdfe261d 100644
--- a/SRC/pzgstrs_lsum.c
+++ b/SRC/pzgstrs_lsum.c
@@ -66,7 +66,7 @@ void zlsum_fmod
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t lptr,      /* Starting position in lsub[*].                      */
  int_t luptr,     /* Starting position in lusup[*].                     */
@@ -84,8 +84,8 @@ void zlsum_fmod
     int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel;
     int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -253,7 +253,7 @@ void zlsum_bmod
  doublecomplex *xk,          /* X[k].                                          */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -279,8 +279,8 @@ void zlsum_bmod
     int_t  *lsub;
     doublecomplex *lusup;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *brecv = Llu->brecv;
-    int_t  **bsendx_plist = Llu->bsendx_plist;
+    int  *brecv = Llu->brecv;
+    int    **bsendx_plist = Llu->bsendx_plist;
     MPI_Status status;
     int test_flag;
 
@@ -431,7 +431,7 @@ void zlsum_fmod_inv
  doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
  int   nrhs,      /* Number of right-hand sides.                        */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t *xsup,
  gridinfo_t *grid,
  zLocalLU_t *Llu,
@@ -454,8 +454,8 @@ void zlsum_fmod_inv
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	flops_t ops_loc=0.0;
@@ -472,9 +472,9 @@ void zlsum_fmod_inv
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 	int   knsupc;    /* Size of supernode k.                               */
 	int_t nlb;       /* Number of L blocks.                                */
 
@@ -733,7 +733,7 @@ void zlsum_fmod_inv
 							 * Send Xk to process column Pc[k].
 							 */
 
-								if(LBtree_ptr[lk].empty_==NO){
+							if(LBtree_ptr[lk].empty_==NO){
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -931,12 +931,11 @@ void zlsum_fmod_inv
 					printf("(%2d) Solve X[%2d]\n", iam, ik);
 #endif
 
-						/*
-						 * Send Xk to process column Pc[k].
-						 */
+					/*
+					 * Send Xk to process column Pc[k].
+					 */
 
 					if(LBtree_ptr[lk].empty_==NO){
-
 #ifdef _OPENMP
 #pragma omp atomic capture
 #endif
@@ -991,7 +990,7 @@ void zlsum_fmod_inv_master
  int   nrhs,      /* Number of right-hand sides.                        */
  int   knsupc,    /* Size of supernode k.                               */
  int_t k,         /* The k-th component of X.                           */
- int_t *fmod,     /* Modification count for L-solve.                    */
+ int *fmod,     /* Modification count for L-solve.                    */
  int_t nlb,       /* Number of L blocks.                                */
  int_t *xsup,
  gridinfo_t *grid,
@@ -1013,8 +1012,8 @@ void zlsum_fmod_inv_master
 	int_t  i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready;
 	int_t  *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-    int_t  *frecv = Llu->frecv;
-    int_t  **fsendx_plist = Llu->fsendx_plist;
+    int  *frecv = Llu->frecv;
+    int  **fsendx_plist = Llu->fsendx_plist;
 	int_t  luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n,  idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder;
 	int thread_id1;
 	int m;
@@ -1032,9 +1031,9 @@ void zlsum_fmod_inv_master
 	int_t luptr;     /* Starting position in lusup[*].                     */
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 	ldalsum=Llu->ldalsum;
 
@@ -1359,10 +1358,11 @@ void zlsum_fmod_inv_master
 					 * Send Xk to process column Pc[k].
 					 */
 
-					if(LBtree_ptr[lk].empty_==NO){
-						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
+					if(LBtree_ptr[lk].empty_==NO) {
+						//BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'z')*nrhs+XK_H,'z');
 						C_BcTree_forwardMessageSimple(&LBtree_ptr[lk], &x[ii - XK_H], LBtree_ptr[lk].msgSize_*nrhs+XK_H);
 					}
+
 					/*
 					 * Perform local block modifications.
 					 */
@@ -1399,7 +1399,7 @@ void zlsum_bmod_inv
  doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -1429,13 +1429,13 @@ void zlsum_bmod_inv
 	int_t  *lsub;
 	doublecomplex *lusup;
 	int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
+	int  *brecv = Llu->brecv;
+	int    **bsendx_plist = Llu->bsendx_plist;
 	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
 	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
 	int test_flag;
-	int_t bmod_tmp;
+	int bmod_tmp;
 	int thread_id1;
 	doublecomplex *rtemp_loc;
 	int_t nroot_send_tmp;
@@ -1446,9 +1446,9 @@ void zlsum_bmod_inv
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof(double);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	iam = grid->iam;
@@ -1887,7 +1887,7 @@ void zlsum_bmod_inv_master
  doublecomplex *rtemp,   /* Result of full matrix-vector multiply.             */
  int    nrhs,	      /* Number of right-hand sides.                    */
  int_t  k,            /* The k-th component of X.                       */
- int_t  *bmod,        /* Modification count for L-solve.                */
+ int  *bmod,        /* Modification count for L-solve.                */
  int_t  *Urbs,        /* Number of row blocks in each block column of U.*/
  Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/
  int_t  **Ucb_valptr, /* Vertical linked list pointing to Unzval[].     */
@@ -1915,8 +1915,8 @@ void zlsum_bmod_inv_master
 	int_t  *lsub;
 	doublecomplex *lusup;
 	int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
-	int_t  *brecv = Llu->brecv;
-	int_t  **bsendx_plist = Llu->bsendx_plist;
+	int *brecv = Llu->brecv;
+	int  **bsendx_plist = Llu->bsendx_plist;
 	C_Tree  *UBtree_ptr = Llu->UBtree_ptr;
 	C_Tree  *URtree_ptr = Llu->URtree_ptr;
 	MPI_Status status;
@@ -1932,9 +1932,9 @@ void zlsum_bmod_inv_master
 	int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend;
 	int_t iword = sizeof(int_t);
 	int_t dword = sizeof (double);
-	int_t aln_d,aln_i;
-	aln_d = ceil(CACHELINE/(double)dword);
-	aln_i = ceil(CACHELINE/(double)iword);
+	int aln_d,aln_i;
+	aln_d = 1; //ceil(CACHELINE/(double)dword);
+	aln_i = 1; //ceil(CACHELINE/(double)iword);
 
 
 	rtemp_loc = &rtemp[sizertemp* thread_id];
@@ -2093,7 +2093,7 @@ void zlsum_bmod_inv_master
 						z_add(&lsum[il + jj ],
 							  &lsum[il + jj ],
 							  &lsum[il + jj + ii*sizelsum]);
-				// RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[ik],'z')*nrhs+LSUM_H,'z');
+				//RdTree_forwardMessageSimple(URtree_ptr[ik],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[ik],'z')*nrhs+LSUM_H,'z');
 				C_RdTree_forwardMessageSimple(&URtree_ptr[ik],&lsum[il - LSUM_H ],URtree_ptr[ik].msgSize_*nrhs+LSUM_H);
 
 #if ( DEBUGlevel>=2 )
@@ -2187,8 +2187,8 @@ void zlsum_bmod_inv_master
 						// fflush(stdout);
 					// }
 					if(UBtree_ptr[lk1].empty_==NO){
-					// BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'z')*nrhs+XK_H,'z');
-					C_BcTree_forwardMessageSimple(&UBtree_ptr[lk1], &x[ii - XK_H], UBtree_ptr[lk1].msgSize_*nrhs+XK_H);
+					  //BcTree_forwardMessageSimple(UBtree_ptr[lk1],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk1],'z')*nrhs+XK_H,'z');
+					  C_BcTree_forwardMessageSimple(&UBtree_ptr[lk1], &x[ii - XK_H], UBtree_ptr[lk1].msgSize_*nrhs+XK_H);
 					}
 
 					/*
diff --git a/SRC/pzsymbfact_distdata.c b/SRC/pzsymbfact_distdata.c
index 55dec4b8..568abc18 100644
--- a/SRC/pzsymbfact_distdata.c
+++ b/SRC/pzsymbfact_distdata.c
@@ -627,8 +627,8 @@ dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable,
       while (i < k + nnzToRecv[p]) {
 	gb = rcv_luind[i];
 	if (gb >= nsupers)
-	  printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n",
-		  iam, p, gb, nsupers, i, i-k);
+	  printf ("Pe[%d] p %d gb %d nsupers %d i " IFMT " i-k " IFMT "\n",
+		  iam, p, (int) gb, (int) nsupers, i, i-k);
 	i += 2;
 	if (sendL) gb_l = LBj( gb, grid );
 	if (sendU) gb_l = LBi( gb, grid );
@@ -1217,7 +1217,7 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int_t *index;        /* indices consist of headers and row subscripts */
   int   *index1;       /* temporary pointer to array of int */
   doublecomplex *lusup, *uval; /* nonzero values in L and U */
-  int_t *recvBuf;
+  int *recvBuf;    // 1/16/22 Sherry changed to int, was:  int_t *recvBuf;
   int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
   doublecomplex **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
   doublecomplex **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
@@ -1245,17 +1245,17 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
   int  *ToRecv, *ToSendD, **ToSendR;
 
   /*-- Counts to be used in lower triangular solve. --*/
-  int_t  *fmod;          /* Modification count for L-solve.        */
-  int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nfsendx = 0;    /* Number of Xk I will send               */
-  int_t  kseen;
+  int  *fmod;          /* Modification count for L-solve.        */
+  int  **fsendx_plist; /* Column process list to send down Xk.   */
+  int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nfsendx = 0;    /* Number of Xk I will send               */
+  int  kseen;
 
   /*-- Counts to be used in upper triangular solve. --*/
-  int_t  *bmod;          /* Modification count for U-solve.        */
-  int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-  int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-  int_t  nbsendx = 0;    /* Number of Xk I will send               */
+  int  *bmod;          /* Modification count for U-solve.        */
+  int  **bsendx_plist; /* Column process list to send down Xk.   */
+  int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+  int  nbsendx = 0;    /* Number of Xk I will send               */
   int_t  *ilsum;         /* starting position of each supernode in
 			    the full array (local)                 */
   int_t  *ilsum_j, ldaspa_j; /* starting position of each supernode in
@@ -1280,8 +1280,9 @@ doublecomplex *dense, *dense_col; /* SPA */
   int_t ldaspa;     /* LDA of SPA */
   int_t iword, dword;
   float mem_use = 0.0;
-  int_t *mod_bit;
-  int_t *frecv, *brecv, *lloc;
+  int *mod_bit;
+  int *frecv, *brecv;
+  int_t *lloc;
   double *SeedSTD_BC,*SeedSTD_RD;
   int_t idx_indx,idx_lusup;
   int_t nbrow;
@@ -1466,11 +1467,11 @@ doublecomplex *dense, *dense_col; /* SPA */
     return (memDist + memNLU + memTRS);
   }
   /* These counts will be used for triangular solves. */
-  if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(fmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for fmod[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
+  if ( !(bmod = int32Calloc_dist(nsupers_i)) ) {
     fprintf(stderr, "Calloc fails for bmod[].");
     return (memDist + memNLU + memTRS);
   }
@@ -1518,29 +1519,29 @@ doublecomplex *dense, *dense_col; /* SPA */
   Lindval_loc_bc_ptr[nsupers_j-1] = NULL;
 
   /* These lists of processes will be used for triangular solves. */
-  if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+  if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
   len = nsupers_j * grid->nprow;
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for fsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    fsendx_plist[i] = &index[j];
-  if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
+    fsendx_plist[i] = &index1[j];
+  if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(nsupers_j*sizeof(int*))) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[].");
     return (memDist + memNLU + memTRS);
   }
-  if ( !(index = intMalloc_dist(len)) ) {
+  if ( !(index1 = int32Malloc_dist(len)) ) {
     fprintf(stderr, "Malloc fails for bsendx_plist[0]");
     return (memDist + memNLU + memTRS);
   }
-  for (i = 0; i < len; ++i) index[i] = EMPTY;
+  for (i = 0; i < len; ++i) index1[i] = EMPTY;
   for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
-    bsendx_plist[i] = &index[j];
+    bsendx_plist[i] = &index1[j];
   /* -------------------------------------------------------------- */
   memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
 
@@ -1564,15 +1565,15 @@ doublecomplex *dense, *dense_col; /* SPA */
 	    printf ("ERR7\n");
 	  jcol = asup_colind[i];
 	  if (jcol >= n)
-	    printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
-		    iam, jb, gb, j, jcol);
+	    printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
+		    iam, (int) jb, (int) gb, (int) j, jcol);
 	  gb = BlockNum( jcol );
 	  lb = LBj( gb, grid );
 	  if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
 	  jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
 	  if (jcol >= ldaspa_j)
-	    printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
-		    iam, jb, gb, j, jcol);
+	    printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
+		    iam, (int) jb, (int) gb, (int) j, jcol);
 	  dense_col[jcol] = asup_val[i];
 	}
 	dense_col += ldaspa_j;
@@ -1800,7 +1801,7 @@ doublecomplex *dense, *dense_col; /* SPA */
 	Lrowind_bc_ptr[ljb_j] = index;
 	if (!(Lnzval_bc_ptr[ljb_j] =
 	      doublecomplexMalloc_dist(len*nsupc))) {
-	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
+	  fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d\n", (int) jb);
 	  return (memDist + memNLU + memTRS);
 	}
 
@@ -1946,7 +1947,7 @@ doublecomplex *dense, *dense_col; /* SPA */
 
   /* exchange information about bsendx_plist in between column of processors */
   k = SUPERLU_MAX( grid->nprow, grid->npcol);
-  if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
+  if ( !(recvBuf = (int *) SUPERLU_MALLOC(nsupers*k* sizeof(int))) ) {
     fprintf (stderr, "Malloc fails for recvBuf[].");
     return (memDist + memNLU + memTRS);
   }
@@ -2002,8 +2003,13 @@ doublecomplex *dense, *dense_col; /* SPA */
     }
   }
 
+#if 0 // Sherry 
   MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
 		 recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+#else		 
+  MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, MPI_INT,
+		 recvBuf, nnzToRecv, ptrToRecv, MPI_INT, grid->comm);
+#endif
 
   for (jb = 0; jb < nsupers; jb++) {
     jbcol = PCOL( jb, grid );
@@ -2034,8 +2040,13 @@ doublecomplex *dense, *dense_col; /* SPA */
   }
 
   /* exchange information about bsendx_plist in between column of processors */
+#if 0 // Sherry 1/16/2022
   MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
 		 MPI_MAX, grid->cscp.comm);
+#else
+  MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, MPI_INT,
+		 MPI_MAX, grid->cscp.comm);
+#endif
 
   for (jb = 0; jb < nsupers; jb ++) {
     jbcol = PCOL( jb, grid);
@@ -2239,9 +2250,8 @@ doublecomplex *dense, *dense_col; /* SPA */
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-				// LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
-				// BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
-
+				//LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
+				//BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z');
 				C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z');
 				LBtree_ptr[ljb].tag_=BC_L;
 
@@ -2294,9 +2304,9 @@ doublecomplex *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for L ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(frecv = intMalloc_dist(nlb)) )
+		if ( !(frecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for frecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2311,9 +2321,11 @@ doublecomplex *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
+#if 0 // Sherry
 		MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+#else		
+		MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif
 
 		k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 		if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
@@ -2411,10 +2423,10 @@ doublecomplex *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						// LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
-						// RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
-            C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
-            LRtree_ptr[lib].tag_=RD_L;
+					//LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
+					//RdTree_SetTag(LRtree_ptr[lib], RD_L,'z');
+					C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
+					LRtree_ptr[lib].tag_=RD_L;
 						// }
 
 						// printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt);
@@ -2567,9 +2579,8 @@ doublecomplex *dense, *dense_col; /* SPA */
 					// rseed=rand();
 					// rseed=1.0;
 					msgsize = SuperSize( jb );
-				// UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
-				// BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
-
+				//UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z');
+				//BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z');
 				C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z');
 				UBtree_ptr[ljb].tag_=BC_U;
 
@@ -2610,9 +2621,9 @@ doublecomplex *dense, *dense_col; /* SPA */
 		/* construct the Reduce tree for U ... */
 		/* the following is used as reference */
 		nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-		if ( !(mod_bit = intMalloc_dist(nlb)) )
+		if ( !(mod_bit = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for mod_bit[].");
-		if ( !(brecv = intMalloc_dist(nlb)) )
+		if ( !(brecv = int32Malloc_dist(nlb)) )
 			ABORT("Malloc fails for brecv[].");
 
 		for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -2627,9 +2638,11 @@ doublecomplex *dense, *dense_col; /* SPA */
 		}
 		/* Every process receives the count, but it is only useful on the
 		   diagonal processes.  */
+#if 0 // Sherry
 		MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+#else		
+		MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
+#endif		
 
 		k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
 		if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
@@ -2726,10 +2739,10 @@ doublecomplex *dense, *dense_col; /* SPA */
 
 						// if(ib==0){
 
-						// URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
-						// RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
-            C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
-			      URtree_ptr[lib].tag_=RD_U;
+					//URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z');
+					//RdTree_SetTag(URtree_ptr[lib], RD_U,'z');
+					C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z');
+					URtree_ptr[lib].tag_=RD_U;
 						// }
 
 						// #if ( PRNTlevel>=1 )
@@ -2820,7 +2833,7 @@ doublecomplex *dense, *dense_col; /* SPA */
 #endif
 
   k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-  if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+  if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
       ABORT("Malloc fails for mod_bit[].");
 
   /* Find the maximum buffer size. */
diff --git a/SRC/sdistribute.c b/SRC/sdistribute.c
index 964f7ce4..0b422f1b 100644
--- a/SRC/sdistribute.c
+++ b/SRC/sdistribute.c
@@ -98,10 +98,10 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	int_t   *Unnz; /* size ceil(NSUPERS/Pc)                 */
     float **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
     int_t  **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
-	BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
-	BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-	RdTree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+	C_Tree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+	C_Tree  *LRtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
+	C_Tree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
+	C_Tree  *URtree_ptr;		  /* size ceil(NSUPERS/Pr)                */
 	int msgsize;
 
     int_t  *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
@@ -112,19 +112,20 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
     int  *ToRecv, *ToSendD, **ToSendR;
 
     /*-- Counts to be used in lower triangular solve. --*/
-    int_t  *fmod;          /* Modification count for L-solve.        */
-    int_t  **fsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nfrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nfsendx = 0;    /* Number of Xk I will send               */
-    int_t  kseen;
+    int  *fmod;          /* Modification count for L-solve.        */
+    int  **fsendx_plist; /* Column process list to send down Xk.   */
+    int  nfrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nfsendx = 0;    /* Number of Xk I will send               */
+    int  kseen;
 
     /*-- Counts to be used in upper triangular solve. --*/
-    int_t  *bmod;          /* Modification count for U-solve.        */
-    int_t  **bsendx_plist; /* Column process list to send down Xk.   */
-    int_t  nbrecvx = 0;    /* Number of Xk I will receive.           */
-    int_t  nbsendx = 0;    /* Number of Xk I will send               */
-    int_t  *ilsum;         /* starting position of each supernode in
-			      the full array (local)                 */
+    int  *bmod;          /* Modification count for U-solve.        */
+    int  **bsendx_plist; /* Column process list to send down Xk.   */
+    int  nbrecvx = 0;    /* Number of Xk I will receive.           */
+    int  nbsendx = 0;    /* Number of Xk I will send               */
+    
+    int_t  *ilsum;       /* starting position of each supernode in
+		            the full array (local)                 */
 
     /*-- Auxiliary arrays; freed on return --*/
     int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr)           */
@@ -150,8 +151,9 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
     int_t iword, sword;
     float mem_use = 0.0;
 
-    int_t *mod_bit;
-    int_t *frecv, *brecv, *lloc;
+    int *mod_bit;
+    int *frecv, *brecv;
+    int_t *lloc;
     float **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     float **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc) */
     double *SeedSTD_BC,*SeedSTD_RD;
@@ -346,7 +348,7 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	    ABORT("Malloc fails for ToSendR[].");
 	j = k * grid->npcol;
 	if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
-	    ABORT("Malloc fails for index[].");
+	    ABORT("Malloc fails for index1[].");
 
 	mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
 
@@ -490,9 +492,9 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	    ABORT("Calloc fails for SPA dense[].");
 
 	/* These counts will be used for triangular solves. */
-	if ( !(fmod = intCalloc_dist(k)) )
+	if ( !(fmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for fmod[].");
-	if ( !(bmod = intCalloc_dist(k)) )
+	if ( !(bmod = int32Calloc_dist(k)) )
 	    ABORT("Calloc fails for bmod[].");
 #if ( PRNTlevel>=1 )
 	mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*sword;
@@ -522,28 +524,27 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	Linv_bc_ptr[k-1] = NULL;
 	Uinv_bc_ptr[k-1] = NULL;
 
-	if ( !(Unnz =
-			(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
+	if ( !(Unnz = (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
 	ABORT("Malloc fails for Unnz[].");
 
 	/* These lists of processes will be used for triangular solves. */
-	if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for fsendx_plist[].");
 	len = k * grid->nprow;
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for fsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    fsendx_plist[i] = &index[j];
-	if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+	    fsendx_plist[i] = &index1[j];
+	if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
 	    ABORT("Malloc fails for bsendx_plist[].");
-	if ( !(index = intMalloc_dist(len)) )
+	if ( !(index1 = int32Malloc_dist(len)) )
 	    ABORT("Malloc fails for bsendx_plist[0]");
-	for (i = 0; i < len; ++i) index[i] = EMPTY;
+	for (i = 0; i < len; ++i) index1[i] = EMPTY;
 	for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
-	    bsendx_plist[i] = &index[j];
+	    bsendx_plist[i] = &index1[j];
 
-	mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+	mem_use += 4.0*k*sizeof(int*) + 2.0*len*sizeof(int);
 
 	/*------------------------------------------------------------
 	  PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
@@ -913,7 +914,7 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	/* construct the Bcast tree for L ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-	if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+	if ( !(LBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for LBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -930,7 +931,7 @@ sdistribute(fact_t fact, int_t n, SuperMatrix *A,
 	MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm);
 
 	for (ljb = 0; ljb comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-				BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				//LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s');
+				C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				LBtree_ptr[ljb].tag_=BC_L;
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
@@ -1051,9 +1054,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	/* construct the Reduce tree for L ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(frecv = intMalloc_dist(nlb)) )
+	if ( !(frecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for frecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1068,12 +1071,11 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+        //MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+	MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+	if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for LRtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1116,7 +1118,7 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 
 
 	for (lib = 0; lib comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-					RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					//LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(LRtree_ptr[lib], RD_L,'s');
+					C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					LRtree_ptr[lib].tag_=RD_L;
 					// }
 
 					// printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt);
@@ -1214,7 +1218,6 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(frecv);
 
-
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
@@ -1239,7 +1242,7 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 	/* construct the Bcast tree for U ... */
 
 	k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
-	if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) )
+	if ( !(UBtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for UBtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1256,7 +1259,7 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 
 
 	for (ljb = 0; ljb nprow*k)) )
@@ -1339,8 +1342,10 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
 				// rseed=rand();
 				// rseed=1.0;
 				msgsize = SuperSize( jb );
-				UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
-				BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				//UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s');
+				//BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s');
+				C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 's');
+				UBtree_ptr[ljb].tag_=BC_U;
 
 				// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
 				// fflush(stdout);
@@ -1378,9 +1383,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	/* construct the Reduce tree for U ... */
 	/* the following is used as reference */
 	nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(mod_bit = intMalloc_dist(nlb)) )
+	if ( !(mod_bit = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for mod_bit[].");
-	if ( !(brecv = intMalloc_dist(nlb)) )
+	if ( !(brecv = int32Malloc_dist(nlb)) )
 		ABORT("Malloc fails for brecv[].");
 
 	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
@@ -1395,12 +1400,11 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	}
 	/* Every process receives the count, but it is only useful on the
 	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
-
-
+	//MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+	MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) )
+	if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
 		ABORT("Malloc fails for URtree_ptr[].");
 	if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) )
 		ABORT("Calloc fails for ActiveFlag[].");
@@ -1462,7 +1466,7 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 
 
 	for (lib = 0; lib comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
-					RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					//URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s');
+					//RdTree_SetTag(URtree_ptr[lib], RD_U,'s');
+					C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 's');
+					URtree_ptr[lib].tag_=RD_U;
 					// }
 
 					// #if ( PRNTlevel>=1 )
@@ -1562,7 +1568,6 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
 	SUPERLU_FREE(mod_bit);
 	SUPERLU_FREE(brecv);
 
-
 	SUPERLU_FREE(ActiveFlag);
 	SUPERLU_FREE(ActiveFlagAll);
 	SUPERLU_FREE(ranks);
@@ -1626,7 +1631,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
 	SUPERLU_FREE(dense);
 
 	k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
-	if ( !(Llu->mod_bit = intMalloc_dist(k)) )
+	if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
 	    ABORT("Malloc fails for mod_bit[].");
 
 	/* Find the maximum buffer size. */
diff --git a/SRC/sldperm_dist.c b/SRC/sldperm_dist.c
index 6178c1b0..944274c1 100644
--- a/SRC/sldperm_dist.c
+++ b/SRC/sldperm_dist.c
@@ -22,9 +22,10 @@ at the top-level directory.
 
 #include "superlu_sdefs.h"
 
-extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
-		    int_t*, int_t [], int_t*, int_t[], int_t*, double [],
-		    int_t [], int_t []);
+extern int mc64ad_dist(int *job, int *n, int_t *ne, int_t *ip,
+       int_t *irn, double *a, int *num, int_t *cperm,
+       int_t *liw, int_t *iw, int_t *ldw, double *dw,
+       int * icntl, int *info);
 
 /*! \brief
  *
@@ -82,21 +83,22 @@ extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
  *        The permutation vector. perm[i] = j means row i in the
  *        original matrix is in row j of the permuted matrix.
  *
- * u      (output) double*, of size n
+ * u      (output) float*, of size n
  *        If job = 5, the natural logarithms of the row scaling factors.
  *
- * v      (output) double*, of size n
+ * v      (output) float*, of size n
  *        If job = 5, the natural logarithms of the column scaling factors.
  *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
  * 
*/ int -sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], +sldperm_dist(int job, int n, int_t nnz, int_t colptr[], int_t adjncy[], float nzval[], int_t *perm, float u[], float v[]) { - int_t i, liw, ldw, num; - int_t *iw, icntl[10], info[10]; + int i, num, icntl[10], info[10]; + int_t liw, ldw; + int_t *iw; double *dw; extern double *doubleMalloc_dist(int_t); double *nzval_d = doubleMalloc_dist(nnz); @@ -149,7 +151,7 @@ sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); #endif if ( info[0] == 1 ) { /* Structurally singular */ - printf(".. The last " IFMT " permutations:\n", n-num); + printf(".. The last %d permutations:\n", n-num); PrintInt10("perm", n-num, &perm[num]); } @@ -166,6 +168,7 @@ sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], SUPERLU_FREE(iw); SUPERLU_FREE(dw); + SUPERLU_FREE(nzval_d); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(0, "Exit sldperm_dist()"); diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h index 3f32e344..48196c3b 100644 --- a/SRC/slustruct_gpu.h +++ b/SRC/slustruct_gpu.h @@ -119,9 +119,9 @@ typedef struct //LUstruct_gpu_ double tHost_PCIeH2D; double tHost_PCIeD2H; - /*gpu events to measure DGEMM and SCATTER timing */ + /*GPU events to measure DGEMM and SCATTER timing */ int *isOffloaded; /*stores if any iteration is offloaded or not*/ - gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/ gpuEvent_t *ePCIeH2D; gpuEvent_t *ePCIeD2H_Start; gpuEvent_t *ePCIeD2H_End; diff --git a/SRC/ssp_blas2_dist.c b/SRC/ssp_blas2_dist.c index 279b97e8..097e7828 100644 --- a/SRC/ssp_blas2_dist.c +++ b/SRC/ssp_blas2_dist.c @@ -420,7 +420,7 @@ sp_sgemv_dist(char *trans, float alpha, SuperMatrix *A, float *x, } /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || alpha == 0. && beta == 1.) + if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. 
&& beta == 1.)) return 0; /* Set LENX and LENY, the lengths of the vectors x and y, and set diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu index 110836ab..43e3dd8e 100644 --- a/SRC/ssuperlu_gpu.cu +++ b/SRC/ssuperlu_gpu.cu @@ -18,14 +18,9 @@ #undef Reduce -//#include +//#include #include "slustruct_gpu.h" -#ifdef HAVE_CUDA -#include "superlu_gpu_utils.cu" -#elif defined(HAVE_HIP) -#include "superlu_gpu_utils.hip.cpp" -#endif //extern "C" { @@ -38,7 +33,7 @@ // #if defined(DEBUG) || defined(_DEBUG) // if (result != GPUBLAS_STATUS_SUCCESS) // { -// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// fprintf(stderr, "GPU BLAS Runtime Error: %s\n", gpublasGetErrorString(result)); // assert(result == GPUBLAS_STATUS_SUCCESS); // } // #endif @@ -226,6 +221,9 @@ void Scatter_GPU_kernel( int nsupc = SuperSize (jb); int ljb = jb / npcol; + typedef int pfx_dtype ; + extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); + float *tempv1; if (jj_st == jj0) { @@ -868,7 +866,7 @@ int sinitSluGPU3D_t( int_t ldt /* NSUP read from sp_ienv(3) */ ) { - checkGPUErrors(gpuDeviceReset ()) ; + checkGPUErrors(gpuDeviceReset ()); Glu_persist_t *Glu_persist = LUstruct->Glu_persist; sLocalLU_t *Llu = LUstruct->Llu; int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 0798638f..ae0163f1 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -89,45 +89,6 @@ typedef struct { } Ucb_indptr_t; #endif - - -/* - * CONSTANTS in MAGMA - */ -#ifndef MAGMA_CONST -#define MAGMA_CONST - - - -// #define DIM_X 32 -// #define DIM_Y 16 - -#define DIM_X 16 -#define DIM_Y 16 - - -#define BLK_M DIM_X*4 -#define BLK_N DIM_Y*4 -#define BLK_K 2048/(BLK_M) - -#define DIM_XA DIM_X -#define DIM_YA DIM_Y -#define DIM_XB DIM_X -#define DIM_YB DIM_Y - -#define NWARP DIM_X*DIM_Y/32 - -// // // // // // #define TILE_SIZE 32 - - -#define THR_M ( BLK_M / DIM_X ) -#define THR_N ( BLK_N / DIM_Y ) - -#define fetch(A, m, n, bound) offs_d##A[min(n*LD##A+m, bound)] -#define fma(A, B, C) C += (A*B) -#endif - - /* * On each processor, the blocks in L are stored in compressed block * column format, the blocks in U are stored in compressed block row format. 
@@ -135,43 +96,41 @@ typedef struct { #define MAX_LOOKAHEADS 50 typedef struct { int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */ + int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ - long int Lrowind_bc_cnt; + long int Lrowind_bc_cnt; double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */ - long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ - long int Lnzval_bc_cnt; - + double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */ + long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Lnzval_bc_cnt; + double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ - long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ - long int Linv_bc_cnt; - + double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ + long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Linv_bc_cnt; + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ - int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */ - long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ - long int Lindval_loc_bc_cnt; - + int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */ + long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Lindval_loc_bc_cnt; int_t *Unnz; /* number of nonzeros per block column in U*/ - int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ - double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double *Uinv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ - long int Uinv_bc_cnt; - + long int Uinv_bc_cnt; int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ long int Ufstnz_br_cnt; - - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */ - long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ + + double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */ + long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ long int Unzval_br_cnt; - + /*-- Data structures used for broadcast and reduction trees. --*/ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ @@ -202,17 +161,18 @@ typedef struct { int **ToSendR; /* List of processes to send right block col. */ /*-- Record communication schedule for forward/back solves. 
--*/ - int_t *fmod; /* Modification count for L-solve */ - int_t **fsendx_plist; /* Column process list to send down Xk */ - int_t *frecv; /* Modifications to be recv'd in proc row */ - int_t nfrecvx; /* Number of Xk I will receive in L-solve */ - int_t nfsendx; /* Number of Xk I will send in L-solve */ - int_t *bmod; /* Modification count for U-solve */ - int_t **bsendx_plist; /* Column process list to send down Xk */ - int_t *brecv; /* Modifications to be recv'd in proc row */ - int_t nbrecvx; /* Number of Xk I will receive in U-solve */ - int_t nbsendx; /* Number of Xk I will send in U-solve */ - int_t *mod_bit; /* Flag contribution from each row blocks */ + /* 1/15/22 Sherry: changed int_t to int type */ + int *fmod; /* Modification count for L-solve */ + int **fsendx_plist; /* Column process list to send down Xk */ + int *frecv; /* Modifications to be recv'd in proc row */ + int nfrecvx; /* Number of Xk I will receive in L-solve */ + int nfsendx; /* Number of Xk I will send in L-solve */ + int *bmod; /* Modification count for U-solve */ + int **bsendx_plist; /* Column process list to send down Xk */ + int *brecv; /* Modifications to be recv'd in proc row */ + int nbrecvx; /* Number of Xk I will receive in U-solve */ + int nbsendx; /* Number of Xk I will send in U-solve */ + int *mod_bit; /* Flag contribution from each row blocks */ /*-- Auxiliary arrays used for forward/back solves. --*/ int_t *ilsum; /* Starting position of each supernode in lsum @@ -244,14 +204,12 @@ typedef struct { Ucb_indptr_t *Ucb_inddat; long int *Ucb_indoffset; long int Ucb_indcnt; - + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t *Ucb_valdat; long int *Ucb_valoffset; long int Ucb_valcnt; - - /* some additional counters for L solve */ int_t n; int_t nleaf; @@ -275,20 +233,21 @@ typedef struct { int_t *d_Ufstnz_br_dat; long int *d_Ufstnz_br_offset; double *d_Unzval_br_dat; - long int *d_Unzval_br_offset; + long int *d_Unzval_br_offset; int_t *d_Ucb_valdat; long int *d_Ucb_valoffset; Ucb_indptr_t *d_Ucb_inddat; long int *d_Ucb_indoffset; - int_t *d_ilsum ; - int_t *d_xsup ; + int_t *d_ilsum ; + int_t *d_xsup ; C_Tree *d_LBtree_ptr ; C_Tree *d_LRtree_ptr ; C_Tree *d_UBtree_ptr ; C_Tree *d_URtree_ptr ; #endif + } dLocalLU_t; @@ -542,7 +501,7 @@ extern int_t pdgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, dSOLVEstruct_t *); extern void pxgstrs_finalize(pxgstrs_comm_t *); -extern int dldperm_dist(int_t, int_t, int_t, int_t [], int_t [], +extern int dldperm_dist(int, int, int_t, int_t [], int_t [], double [], int_t *, double [], double []); extern int dstatic_schedule(superlu_dist_options_t *, int, int, dLUstruct_t *, gridinfo_t *, SuperLUStat_t *, @@ -585,39 +544,39 @@ extern int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, dScalePermstruct_t *, Glu_persist_t *, gridinfo_t *, dSOLVEstruct_t *); extern void dlsum_fmod(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, int_t, int_t, + int, int, int_t , int *fmod, int_t, int_t, int_t, int_t *, gridinfo_t *, dLocalLU_t *, MPI_Request [], SuperLUStat_t *); extern void dlsum_bmod(double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, dLocalLU_t *, MPI_Request [], SuperLUStat_t *); extern void dlsum_fmod_inv(double *, double *, double *, double *, - int, int_t , int_t *, + int, int_t , int *fmod, int_t *, gridinfo_t *, dLocalLU_t *, 
SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int); -extern void dComputeLevelsets(int , int_t , gridinfo_t *, - Glu_persist_t *, dLocalLU_t *, int_t *); - -#ifdef GPU_ACC -extern void dlsum_fmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, double *,double *,int,int, int_t , int_t *, C_Tree *, C_Tree *, int_t *, int_t *,long int *, double *, long int *, double *, long int *, int_t *, long int *, int_t *, gridinfo_t *, double * , double * , int_t ); -extern void dlsum_bmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, double *,double *,int,int, int_t , int_t *, C_Tree *, C_Tree *, int_t *, int_t *,int_t *, long int *,double *,long int *,int_t *,long int *,Ucb_indptr_t *,long int *,double *,long int *,int_t *,gridinfo_t *); -#endif - extern void dlsum_fmod_inv_master(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, + int, int, int_t , int *fmod, int_t, int_t *, gridinfo_t *, dLocalLU_t *, SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int); extern void dlsum_bmod_inv(double *, double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, dLocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int); extern void dlsum_bmod_inv_master(double *, double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, dLocalLU_t *, SuperLUStat_t **, int_t, int_t, int, int); +extern void dComputeLevelsets(int , int_t , gridinfo_t *, + Glu_persist_t *, dLocalLU_t *, int_t *); + +#ifdef GPU_ACC +extern void dlsum_fmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, double *, double *, int, int, int_t , int *fmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, double *, int64_t *, double *, int64_t *, int_t *, int64_t *, int_t *, gridinfo_t *, double * , double * , int_t ); +extern void dlsum_bmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, double *, double *,int,int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *,int_t *, int64_t *, double *, int64_t *, int_t *, int64_t *, Ucb_indptr_t *, int64_t *, double *, int64_t *,int_t *,gridinfo_t *); +#endif + extern void pdgsrfs(int_t, SuperMatrix *, double, dLUstruct_t *, dScalePermstruct_t *, gridinfo_t *, double [], int_t, double [], int_t, int, @@ -692,6 +651,7 @@ extern int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *); extern int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A); extern void Printdouble5(char *, int_t, double *); extern int file_Printdouble5(FILE *, char *, int_t, double *); + extern void dGenCOOLblocks(int, int_t, gridinfo_t*, Glu_persist_t*, dLocalLU_t *, int_t** , int_t** , double ** , int_t* , int_t* ); extern void dGenCSCLblocks(int, int_t, gridinfo_t*, @@ -699,6 +659,7 @@ extern void dGenCSCLblocks(int, int_t, gridinfo_t*, extern void dGenCSRLblocks(int, int_t, gridinfo_t*, Glu_persist_t*, dLocalLU_t *, double **, int_t **, int_t **, int_t*, int_t*); + /* BLAS */ #ifdef USE_VENDOR_BLAS @@ -758,7 +719,6 @@ extern int superlu_dtrsv(char *uplo, char *trans, char *diag, int n, double *a, int lda, double *x, int incx); #ifdef SLU_HAVE_LAPACK -// LAPACK routine extern void dtrtri_(char*, char*, int*, double*, int*, int*); #endif @@ -1012,19 +972,13 @@ extern int_t dIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, MPI_Request *, dLocalLU_t *, int); extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*, dLocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t 
Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, - MPI_Request *s, SCT_t*); -extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); -extern int_t Check_LRecv(MPI_Request*, int* msgcnt); extern int_t dWait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, gridinfo_t *, SCT_t*); extern int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, MPI_Request *, gridinfo_t *, int); extern int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, int_t src, gridinfo_t *, SCT_t*, int); -extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); -extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *, gridinfo_t *, dLocalLU_t *); extern int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size, @@ -1032,16 +986,12 @@ extern int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size, extern int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, int_t src, MPI_Request *, gridinfo_t *, SCT_t*, int); -extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); -extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); extern int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size, int_t src, MPI_Request *, gridinfo_t*, SCT_t*, int); -extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); -extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); - extern int_t dUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, MPI_Request *, gridinfo_t *, dLUstruct_t *, SCT_t *); extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *); + #if (MPI_VERSION>2) extern int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size, MPI_Request *, gridinfo_t *); @@ -1060,8 +1010,6 @@ extern int_t dDiagFactIBCast(int_t k, int_t k0, extern int_t dUPanelTrSolve( int_t k, double* BlockLFactor, double* bigV, int_t ldt, Ublock_info_t*, gridinfo_t *, dLUstruct_t *, SuperLUStat_t *, SCT_t *); -extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *, - gridinfo_t *, SCT_t *); extern int_t dLPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, MPI_Request *, double* BlockUFactor, gridinfo_t *, dLUstruct_t *, SCT_t *); diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index b5691810..4de1431f 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -51,6 +51,7 @@ at the top-level directory. #include #include #include +#include // #include #include #include @@ -85,10 +86,10 @@ at the top-level directory. #include "superlu_dist_config.h" #ifdef HAVE_CUDA -#ifndef GPU_ACC #define GPU_ACC +//#include "cublas_utils.h" #endif -#endif + #ifdef HAVE_HIP #ifndef GPU_ACC #define GPU_ACC @@ -96,7 +97,6 @@ at the top-level directory. 
#endif #ifdef GPU_ACC -//#include "gpu_wrapper.h" #include "gpu_api_utils.h" #endif @@ -296,6 +296,42 @@ static const int RD_U=4; /* MPI tag for lsum in U-solve*/ #endif /* MSVC */ #endif /* SUPERLU_DIST_EXPORT */ + +/* + * CONSTANTS in MAGMA + */ +#ifndef MAGMA_CONST +#define MAGMA_CONST + +// #define DIM_X 32 +// #define DIM_Y 16 + +#define DIM_X 16 +#define DIM_Y 16 + + +#define BLK_M DIM_X*4 +#define BLK_N DIM_Y*4 +#define BLK_K 2048/(BLK_M) + +#define DIM_XA DIM_X +#define DIM_YA DIM_Y +#define DIM_XB DIM_X +#define DIM_YB DIM_Y + +#define NWARP DIM_X*DIM_Y/32 + +// // // // // // #define TILE_SIZE 32 + + +#define THR_M ( BLK_M / DIM_X ) +#define THR_N ( BLK_N / DIM_Y ) + +#define fetch(A, m, n, bound) offs_d##A[min(n*LD##A+m, bound)] +#define fma(A, B, C) C += (A*B) +#endif +/*---- end MAGMA ----*/ + #ifdef __cplusplus extern "C" { #endif @@ -940,7 +976,7 @@ typedef struct xtrsTimer_t double ppXmem; // perprocess X-memory } xtrsTimer_t; -/*==== For 3D code ====*/ +/*==== end For 3D code ====*/ /*====================*/ @@ -996,9 +1032,11 @@ extern float smach_dist(char *); extern double dmach_dist(char *); extern void *superlu_malloc_dist (size_t); extern void superlu_free_dist (void*); +extern int *int32Malloc_dist (int); +extern int *int32Calloc_dist (int); extern int_t *intMalloc_dist (int_t); extern int_t *intCalloc_dist (int_t); -extern int_t mc64id_dist(int_t *); +extern int mc64id_dist(int *); extern void arrive_at_ublock (int_t, int_t *, int_t *, int_t *, int_t *, int_t *, int_t, int_t, int_t *, int_t *, int_t *, gridinfo_t *); @@ -1239,15 +1277,21 @@ extern int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr extern void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup, int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, gridinfo3d_t * grid3d); +extern int Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, + MPI_Request *L_diag_blk_send_req, + gridinfo_t *grid, SCT_t *SCT); + extern int getNsupers(int n, Glu_persist_t *Glu_persist); extern int set_tag_ub(); extern int getNumThreads(int); extern int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, gridinfo_t *, int_t *, int_t *); + #if 0 // Sherry: conflicting with existing routine extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, Glu_persist_t *, gridinfo_t*, int_t* perm_u); #endif + extern int_t* getFactPerm(int_t); extern int_t* getFactIperm(int_t*, int_t); @@ -1288,6 +1332,19 @@ extern int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d); + /* from communication_aux.h */ +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, + MPI_Request *s, SCT_t*); +extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Check_LRecv(MPI_Request*, int* msgcnt); +extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *); + extern int getnGPUStreams(); extern int get_mpi_process_per_gpu (); diff --git a/SRC/superlu_dist_config.h 
b/SRC/superlu_dist_config.h index 45154eba..a477e6e1 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,7 @@ /* superlu_dist_config.h.in */ /* Enable CUDA */ -#define HAVE_CUDA TRUE +/* #undef HAVE_CUDA */ /* Enable HIP */ /* #undef HAVE_HIP */ diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h index 3103e46e..aca7df37 100644 --- a/SRC/superlu_enum_consts.h +++ b/SRC/superlu_enum_consts.h @@ -39,8 +39,11 @@ typedef enum {USUB, LSUB, UCOL, LUSUP, LLVL, ULVL, NO_MEMTYPE} MemType; typedef enum {HEAD, TAIL} stack_end_t; typedef enum {SYSTEM, USER} LU_space_t; typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t; + +/* + * The following are for ILUTP in serial SuperLU + */ typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t; -#if 0 typedef enum {NODROP = 0x0000, DROP_BASIC = 0x0001, /* ILU(tau) */ DROP_PROWS = 0x0002, /* ILUTP: keep p maximum rows */ @@ -52,7 +55,6 @@ typedef enum {NODROP = 0x0000, DROP_SECONDARY = 0x000E, /* PROWS | COLUMN | AREA */ DROP_DYNAMIC = 0x0010, DROP_INTERP = 0x0100} rule_t; -#endif /* diff --git a/SRC/superlu_gpu_utils.cu b/SRC/superlu_gpu_utils.cu index 088cb922..90e6ea6f 100644 --- a/SRC/superlu_gpu_utils.cu +++ b/SRC/superlu_gpu_utils.cu @@ -12,7 +12,6 @@ cudaError_t checkCuda(cudaError_t result) #endif return result; } -#endif __device__ int dnextpow2(int v) diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index 9af2d7ba..9c71e5ea 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -96,14 +96,41 @@ typedef struct { #define MAX_LOOKAHEADS 50 typedef struct { int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */ + long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Lrowind_bc_cnt; + doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */ + long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Lnzval_bc_cnt; + doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ + long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Linv_bc_cnt; + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ + int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */ + long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Lindval_loc_bc_cnt; int_t *Unnz; /* number of nonzeros per block column in U*/ - int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ - doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ + doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex *Uinv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ + long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ + long int Uinv_bc_cnt; + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */ + long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Ufstnz_br_cnt; + + doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + doublecomplex *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */ + long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Unzval_br_cnt; + /*-- 
Data structures used for broadcast and reduction trees. --*/ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ @@ -134,17 +161,18 @@ typedef struct { int **ToSendR; /* List of processes to send right block col. */ /*-- Record communication schedule for forward/back solves. --*/ - int_t *fmod; /* Modification count for L-solve */ - int_t **fsendx_plist; /* Column process list to send down Xk */ - int_t *frecv; /* Modifications to be recv'd in proc row */ - int_t nfrecvx; /* Number of Xk I will receive in L-solve */ - int_t nfsendx; /* Number of Xk I will send in L-solve */ - int_t *bmod; /* Modification count for U-solve */ - int_t **bsendx_plist; /* Column process list to send down Xk */ - int_t *brecv; /* Modifications to be recv'd in proc row */ - int_t nbrecvx; /* Number of Xk I will receive in U-solve */ - int_t nbsendx; /* Number of Xk I will send in U-solve */ - int_t *mod_bit; /* Flag contribution from each row blocks */ + /* 1/15/22 Sherry: changed int_t to int type */ + int *fmod; /* Modification count for L-solve */ + int **fsendx_plist; /* Column process list to send down Xk */ + int *frecv; /* Modifications to be recv'd in proc row */ + int nfrecvx; /* Number of Xk I will receive in L-solve */ + int nfsendx; /* Number of Xk I will send in L-solve */ + int *bmod; /* Modification count for U-solve */ + int **bsendx_plist; /* Column process list to send down Xk */ + int *brecv; /* Modifications to be recv'd in proc row */ + int nbrecvx; /* Number of Xk I will receive in U-solve */ + int nbsendx; /* Number of Xk I will send in U-solve */ + int *mod_bit; /* Flag contribution from each row blocks */ /*-- Auxiliary arrays used for forward/back solves. --*/ int_t *ilsum; /* Starting position of each supernode in lsum @@ -173,13 +201,53 @@ typedef struct { int_t *ut_modbit; int_t *Urbs; Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + Ucb_indptr_t *Ucb_inddat; + long int *Ucb_indoffset; + long int Ucb_indcnt; + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t *Ucb_valdat; + long int *Ucb_valoffset; + long int Ucb_valcnt; /* some additional counters for L solve */ int_t n; int_t nleaf; int_t nfrecvmod; int_t inv; /* whether the diagonal block is inverted*/ + + /* The following variables are used in GPU trisolve*/ +#ifdef GPU_ACC + int_t *d_Lrowind_bc_dat; + long int *d_Lrowind_bc_offset; + doublecomplex *d_Lnzval_bc_dat; + long int *d_Lnzval_bc_offset; + doublecomplex *d_Linv_bc_dat ; + doublecomplex *d_Uinv_bc_dat ; + long int *d_Linv_bc_offset ; + long int *d_Uinv_bc_offset ; + int_t *d_Lindval_loc_bc_dat ; + long int *d_Lindval_loc_bc_offset ; + + int_t *d_Urbs; + int_t *d_Ufstnz_br_dat; + long int *d_Ufstnz_br_offset; + doublecomplex *d_Unzval_br_dat; + long int *d_Unzval_br_offset; + + int_t *d_Ucb_valdat; + long int *d_Ucb_valoffset; + Ucb_indptr_t *d_Ucb_inddat; + long int *d_Ucb_indoffset; + + int_t *d_ilsum ; + int_t *d_xsup ; + C_Tree *d_LBtree_ptr ; + C_Tree *d_LRtree_ptr ; + C_Tree *d_UBtree_ptr ; + C_Tree *d_URtree_ptr ; +#endif + } zLocalLU_t; @@ -433,7 +501,7 @@ extern int_t pzgstrs_init(int_t, int_t, int_t, int_t, int_t [], int_t [], gridinfo_t *grid, Glu_persist_t *, zSOLVEstruct_t *); extern void pxgstrs_finalize(pxgstrs_comm_t *); -extern int zldperm_dist(int_t, int_t, int_t, int_t [], int_t [], +extern int zldperm_dist(int, int, int_t, int_t [], int_t [], doublecomplex [], int_t *, double [], double []); extern int zstatic_schedule(superlu_dist_options_t *, int, int, 
zLUstruct_t *, gridinfo_t *, SuperLUStat_t *, @@ -476,31 +544,39 @@ extern int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_ zScalePermstruct_t *, Glu_persist_t *, gridinfo_t *, zSOLVEstruct_t *); extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int, int_t , int_t *, int_t, int_t, int_t, + int, int, int_t , int *fmod, int_t, int_t, int_t, int_t *, gridinfo_t *, zLocalLU_t *, MPI_Request [], SuperLUStat_t *); extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, zLocalLU_t *, MPI_Request [], SuperLUStat_t *); extern void zlsum_fmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int_t , int_t *, + int, int_t , int *fmod, int_t *, gridinfo_t *, zLocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int); extern void zlsum_fmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int, int_t , int_t *, int_t, + int, int, int_t , int *fmod, int_t, int_t *, gridinfo_t *, zLocalLU_t *, SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int); extern void zlsum_bmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, zLocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int); extern void zlsum_bmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int *bmod, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, zLocalLU_t *, SuperLUStat_t **, int_t, int_t, int, int); +extern void zComputeLevelsets(int , int_t , gridinfo_t *, + Glu_persist_t *, zLocalLU_t *, int_t *); + +#ifdef GPU_ACC +extern void zlsum_fmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, doublecomplex *, doublecomplex *, int, int, int_t , int *fmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, doublecomplex *, int64_t *, doublecomplex *, int64_t *, int_t *, int64_t *, int_t *, gridinfo_t *, doublecomplex * , doublecomplex * , int_t ); +extern void dlsum_bmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, doublecomplex *, doublecomplex *,int,int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *,int_t *, int64_t *, doublecomplex *, int64_t *, int_t *, int64_t *, Ucb_indptr_t *, int64_t *, doublecomplex *, int64_t *,int_t *,gridinfo_t *); +#endif + extern void pzgsrfs(int_t, SuperMatrix *, double, zLUstruct_t *, zScalePermstruct_t *, gridinfo_t *, doublecomplex [], int_t, doublecomplex [], int_t, int, @@ -578,6 +654,13 @@ extern int file_zPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A); extern void PrintDoublecomplex(char *, int_t, doublecomplex *); extern int file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *); +extern void zGenCOOLblocks(int, int_t, gridinfo_t*, + Glu_persist_t*, zLocalLU_t *, int_t** , int_t** , doublecomplex ** , int_t* , int_t* ); +extern void zGenCSCLblocks(int, int_t, gridinfo_t*, + Glu_persist_t*, zLocalLU_t *, doublecomplex **, int_t **, int_t **, int_t*, int_t*); +extern void zGenCSRLblocks(int, int_t, gridinfo_t*, + Glu_persist_t*, zLocalLU_t *, doublecomplex **, int_t **, int_t **, int_t*, int_t*); + /* BLAS */ @@ -638,7 +721,6 @@ extern int superlu_ztrsv(char *uplo, char *trans, char *diag, int n, doublecomplex 
*a, int lda, doublecomplex *x, int incx); #ifdef SLU_HAVE_LAPACK -// LAPACK routine extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*); #endif @@ -892,19 +974,13 @@ extern int_t zIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, MPI_Request *, zLocalLU_t *, int); extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*, zLocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, - MPI_Request *s, SCT_t*); -extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t zWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); -extern int_t Check_LRecv(MPI_Request*, int* msgcnt); extern int_t zWait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, gridinfo_t *, SCT_t*); extern int_t zISend_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, MPI_Request *, gridinfo_t *, int); extern int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, int_t src, gridinfo_t *, SCT_t*, int); -extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); -extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *, gridinfo_t *, zLocalLU_t *); extern int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, int_t size, @@ -912,16 +988,12 @@ extern int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, int_t size, extern int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, int_t src, MPI_Request *, gridinfo_t *, SCT_t*, int); -extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); -extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); extern int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, int_t size, int_t src, MPI_Request *, gridinfo_t*, SCT_t*, int); -extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); -extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); - extern int_t zUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, MPI_Request *, gridinfo_t *, zLUstruct_t *, SCT_t *); extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *); + #if (MPI_VERSION>2) extern int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, int_t size, MPI_Request *, gridinfo_t *); @@ -940,8 +1012,6 @@ extern int_t zDiagFactIBCast(int_t k, int_t k0, extern int_t zUPanelTrSolve( int_t k, doublecomplex* BlockLFactor, doublecomplex* bigV, int_t ldt, Ublock_info_t*, gridinfo_t *, zLUstruct_t *, SuperLUStat_t *, SCT_t *); -extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *, - gridinfo_t *, SCT_t *); extern int_t zLPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, MPI_Request *, doublecomplex* BlockUFactor, gridinfo_t *, zLUstruct_t *, SCT_t *); diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c index c683feaf..3afa05ec 100644 --- a/SRC/supernodal_etree.c +++ b/SRC/supernodal_etree.c @@ -1031,7 +1031,7 @@ void printEtree(int_t nsuper, int_t *setree, treeList_t* treeList) { /* code */ // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); - fprintf(fp, " \"%d|%ld\" -> \"%ld|%ld\";\n", i, treeList[i].depth, + fprintf(fp, " \"%d|%d\" -> \"%ld|%ld\";\n", i, (int) treeList[i].depth, (long int) setree[i], (long int) treeList[setree[i]].depth); } diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c index 4a27e531..4ae520e2 100644 --- a/SRC/sutil_dist.c +++ b/SRC/sutil_dist.c @@ -542,11 +542,11 @@ void sinf_norm_error_dist(int_t n, int_t nrhs, float *x, int_t ldx, void Printfloat5(char *name, int_t 
len, float *x) { - register int_t i; + register int i; printf("%10s:", name); for (i = 0; i < len; ++i) { - if ( i % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) i, (long int) i+4); + if ( i % 5 == 0 ) printf("\n[%d-%d] ", i, i+4); printf("%14e", x[i]); } printf("\n"); @@ -605,11 +605,11 @@ void sPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid, } printf("(%d)", iam); PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]); - PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); + PrintInt32("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); } - printf("nfrecvx " IFMT "\n", Llu->nfrecvx); + printf("nfrecvx %d\n", Llu->nfrecvx); k = CEILING( nsupers, grid->nprow ); - PrintInt10("fmod", k, Llu->fmod); + PrintInt32("fmod", k, Llu->fmod); } /* SPRINTLBLOCKS */ @@ -620,13 +620,13 @@ void sZeroLblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct) { float zero = 0.0; register int extra, gb, j, lb, nsupc, nsupr, ncb; - register int_t k, mycol, r; + register int k, mycol, r; sLocalLU_t *Llu = LUstruct->Llu; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; int_t *xsup = Glu_persist->xsup; int_t *index; float *nzval; - int_t nsupers = Glu_persist->supno[n-1] + 1; + int nsupers = Glu_persist->supno[n-1] + 1; ncb = nsupers / grid->npcol; extra = nsupers % grid->npcol; diff --git a/SRC/util.c b/SRC/util.c index 30175ee2..a0e39fc7 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -276,14 +276,14 @@ void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, int_t *segrep, int_t *repfnz) { - int_t j, k; + int j, k; for (j = jcol; j < jcol + w; j++) { - printf("\tcol " IFMT ":\n", j); + printf("\tcol %d:\n", j); for (k = 0; k < nseg; k++) - printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, - segrep[k], repfnz[(j - jcol) * n + segrep[k]]); + printf("\t\tseg %d, segrep %d, repfnz %d\n", k, + (int)segrep[k], (int)repfnz[(j - jcol) * n + segrep[k]]); } } @@ -537,11 +537,11 @@ void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, /*! 
\brief Get the statistics of the supernodes */ #define NBUCKS 10 -static int_t max_sup_size; +static int max_sup_size; void super_stats_dist(int_t nsuper, int_t *xsup) { - register int_t nsup1 = 0; + register int nsup1 = 0; int_t i, isize, whichb, bl, bh; int_t bucket[NBUCKS]; @@ -556,9 +556,9 @@ void super_stats_dist(int_t nsuper, int_t *xsup) max_sup_size = isize; } - printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper + 1); - printf("\tmax supernode size = " IFMT "\n", max_sup_size); - printf("\tno of size 1 supernodes = " IFMT "\n", nsup1); + printf(" Supernode statistics:\n\tno of super = %d\n", (int)nsuper + 1); + printf("\tmax supernode size = %d\n", max_sup_size); + printf("\tno of size 1 supernodes = %d\n", nsup1); /* Histogram of the supernode sizes */ ifill_dist(bucket, NBUCKS, 0); @@ -577,7 +577,7 @@ void super_stats_dist(int_t nsuper, int_t *xsup) { bl = (float)i * max_sup_size / NBUCKS; bh = (float)(i + 1) * max_sup_size / NBUCKS; - printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl + 1, bh, bucket[i]); + printf("\tsnode: %d-%d\t\t%d\n", (int)bl + 1, (int)bh, (int)bucket[i]); } } @@ -585,14 +585,14 @@ void super_stats_dist(int_t nsuper, int_t *xsup) */ void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) { - int_t jj, k; + int jj, k; for (jj = jcol; jj < jcol + w; jj++) for (k = 0; k < n; k++) if (repfnz[(jj - jcol) * n + k] != EMPTY) { - fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", - jj, k, repfnz[(jj - jcol) * n + k]); + fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", + jj, k, (int)repfnz[(jj - jcol) * n + k]); ABORT("check_repfnz_dist"); } } @@ -1032,8 +1032,8 @@ int_t estimate_bigu_size( #if (PRNTlevel >= 1) if (iam == 0) { - printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", - *max_ncols, max_ldu, max_ldu * (*max_ncols)); + printf("max_ncols %d, max_ldu %d, bigu_size " IFMT "\n", + (int)*max_ncols, (int)max_ldu, max_ldu * (*max_ncols)); fflush(stdout); } #endif diff --git a/SRC/zdistribute.c b/SRC/zdistribute.c index f88a9280..9b3885fe 100644 --- a/SRC/zdistribute.c +++ b/SRC/zdistribute.c @@ -111,19 +111,20 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, int *ToRecv, *ToSendD, **ToSendR; /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; + int *fmod; /* Modification count for L-solve. */ + int **fsendx_plist; /* Column process list to send down Xk. */ + int nfrecvx = 0; /* Number of Xk I will receive. */ + int nfsendx = 0; /* Number of Xk I will send */ + int kseen; /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ + int *bmod; /* Modification count for U-solve. */ + int **bsendx_plist; /* Column process list to send down Xk. */ + int nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int nbsendx = 0; /* Number of Xk I will send */ + + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ /*-- Auxiliary arrays; freed on return --*/ int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ @@ -149,8 +150,9 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, int_t iword, zword; float mem_use = 0.0; - int_t *mod_bit; - int_t *frecv, *brecv, *lloc; + int *mod_bit; + int *frecv, *brecv; + int_t *lloc; doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double *SeedSTD_BC,*SeedSTD_RD; @@ -345,7 +347,7 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Malloc fails for ToSendR[]."); j = k * grid->npcol; if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) - ABORT("Malloc fails for index[]."); + ABORT("Malloc fails for index1[]."); mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; @@ -489,9 +491,9 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Calloc fails for SPA dense[]."); /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) + if ( !(fmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) + if ( !(bmod = int32Calloc_dist(k)) ) ABORT("Calloc fails for bmod[]."); #if ( PRNTlevel>=1 ) mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*zword; @@ -521,28 +523,27 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, Linv_bc_ptr[k-1] = NULL; Uinv_bc_ptr[k-1] = NULL; - if ( !(Unnz = - (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + if ( !(Unnz = (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) ABORT("Malloc fails for Unnz[]."); /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for fsendx_plist[]."); len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + fsendx_plist[i] = &index1[j]; + if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) + if ( !(index1 = int32Malloc_dist(len)) ) ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0; i < len; ++i) index1[i] = EMPTY; for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; + bsendx_plist[i] = &index1[j]; - mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + mem_use += 4.0*k*sizeof(int*) + 2.0*len*sizeof(int); /*------------------------------------------------------------ PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. 
@@ -994,14 +995,15 @@ zdistribute(fact_t fact, int_t n, SuperMatrix *A, for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); - // BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z'); + //LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + //BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z'); C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z'); LBtree_ptr[ljb].tag_=BC_L; + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); // fflush(stdout); @@ -1051,9 +1053,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); /* construct the Reduce tree for L ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) + if ( !(frecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -1068,9 +1070,8 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - - + //MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) ) @@ -1182,8 +1183,8 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); // if(ib==0){ - // LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); - // RdTree_SetTag(LRtree_ptr[lib], RD_L,'z'); + //LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + //RdTree_SetTag(LRtree_ptr[lib], RD_L,'z'); C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z'); LRtree_ptr[lib].tag_=RD_L; // } @@ -1216,7 +1217,6 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); SUPERLU_FREE(mod_bit); SUPERLU_FREE(frecv); - SUPERLU_FREE(ActiveFlag); SUPERLU_FREE(ActiveFlagAll); SUPERLU_FREE(ranks); @@ -1341,8 +1341,8 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); // rseed=rand(); // rseed=1.0; msgsize = SuperSize( jb ); - // UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); - // BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z'); + //UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + //BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z'); C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'z'); UBtree_ptr[ljb].tag_=BC_U; @@ -1382,9 +1382,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); /* construct the Reduce tree for U ... */ /* the following is used as reference */ nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) + if ( !(mod_bit = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for mod_bit[]."); - if ( !(brecv = intMalloc_dist(nlb)) ) + if ( !(brecv = int32Malloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); for (k = 0; k < nlb; ++k) mod_bit[k] = 0; @@ -1399,9 +1399,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); } /* Every process receives the count, but it is only useful on the diagonal processes. 
*/ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - - + //MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) ) @@ -1544,8 +1543,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); // if(ib==0){ - // URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); - // RdTree_SetTag(URtree_ptr[lib], RD_U,'z'); + //URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + //RdTree_SetTag(URtree_ptr[lib], RD_U,'z'); C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'z'); URtree_ptr[lib].tag_=RD_U; // } @@ -1568,7 +1567,6 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); SUPERLU_FREE(mod_bit); SUPERLU_FREE(brecv); - SUPERLU_FREE(ActiveFlag); SUPERLU_FREE(ActiveFlagAll); SUPERLU_FREE(ranks); @@ -1632,7 +1630,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); SUPERLU_FREE(dense); k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + if ( !(Llu->mod_bit = int32Malloc_dist(k)) ) ABORT("Malloc fails for mod_bit[]."); /* Find the maximum buffer size. */ diff --git a/SRC/zldperm_dist.c b/SRC/zldperm_dist.c index 4b7d3b81..ef124c6f 100644 --- a/SRC/zldperm_dist.c +++ b/SRC/zldperm_dist.c @@ -21,9 +21,10 @@ at the top-level directory. #include "superlu_zdefs.h" -extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], - int_t*, int_t [], int_t*, int_t[], int_t*, double [], - int_t [], int_t []); +extern int mc64ad_dist(int *job, int *n, int_t *ne, int_t *ip, + int_t *irn, double *a, int *num, int_t *cperm, + int_t *liw, int_t *iw, int_t *ldw, double *dw, + int * icntl, int *info); /*! \brief * @@ -91,11 +92,12 @@ extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [], */ int -zldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], +zldperm_dist(int job, int n, int_t nnz, int_t colptr[], int_t adjncy[], doublecomplex nzval[], int_t *perm, double u[], double v[]) { - int_t i, liw, ldw, num; - int_t *iw, icntl[10], info[10]; + int i, num, icntl[10], info[10]; + int_t liw, ldw; + int_t *iw; double *dw; extern double *doubleMalloc_dist(int_t); double *nzval_abs = doubleMalloc_dist(nnz); @@ -148,7 +150,7 @@ zldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); #endif if ( info[0] == 1 ) { /* Structurally singular */ - printf(".. The last " IFMT " permutations:\n", n-num); + printf(".. 
The last %d permutations:\n", n-num); PrintInt10("perm", n-num, &perm[num]); } diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h index 880f6d4b..ef14048b 100644 --- a/SRC/zlustruct_gpu.h +++ b/SRC/zlustruct_gpu.h @@ -118,9 +118,9 @@ typedef struct //LUstruct_gpu_ double tHost_PCIeH2D; double tHost_PCIeD2H; - /*gpu events to measure DGEMM and SCATTER timing */ + /*GPU events to measure DGEMM and SCATTER timing */ int *isOffloaded; /*stores if any iteration is offloaded or not*/ - gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/ + gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/ gpuEvent_t *ePCIeH2D; gpuEvent_t *ePCIeD2H_Start; gpuEvent_t *ePCIeD2H_End; diff --git a/SRC/zsp_blas2_dist.c b/SRC/zsp_blas2_dist.c index 54a55bd3..0d54f1bc 100644 --- a/SRC/zsp_blas2_dist.c +++ b/SRC/zsp_blas2_dist.c @@ -8,8 +8,9 @@ All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file - * \brief Solves one of the systems of equations A*x = b, or A'*x = b + * \brief Sparse BLAS 2, using some dense BLAS 2 operations * *
  * -- Distributed SuperLU routine (version 1.0) --
@@ -19,7 +20,7 @@ at the top-level directory.
  */
 
 /*
- * File name:		sp_blas2.c
+ * File name:		zsp_blas2_dist.c
  * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
  */
 
@@ -30,19 +31,18 @@ at the top-level directory.
  * Function prototypes 
  */
 #ifndef USE_VENDOR_BLAS
-void zusolve(int, int, doublecomplex*, doublecomplex*);
-void zlsolve(int, int, doublecomplex*, doublecomplex*);
-void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
+extern void zusolve(int, int, doublecomplex*, doublecomplex*);
+extern void zlsolve(int, int, doublecomplex*, doublecomplex*);
+extern void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
 #endif
 
-
 /*! \brief
  *
  * 
  *   Purpose
  *   =======
  *
- *   sp_ztrsv() solves one of the systems of equations   
+ *   sp_ztrsv_dist() solves one of the systems of equations   
  *       A*x = b,   or   A'*x = b,
  *   where b and x are n element vectors and A is a sparse unit , or   
  *   non-unit, upper or lower triangular matrix.   
@@ -74,12 +74,12 @@ void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
  *	     
  *   L       - (input) SuperMatrix*
  *	       The factor L from the factorization Pr*A*Pc=L*U. Use
- *             compressed row subscripts storage for supernodes,
- *             i.e., L has types: Stype = SC, Dtype = Z, Mtype = TRLU.
+ *             compressed row subscripts storage for supernodes, i.e.,
+ *             L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU.
  *
  *   U       - (input) SuperMatrix*
  *	        The factor U from the factorization Pr*A*Pc=L*U.
- *	        U has types: Stype = NC, Dtype = Z, Mtype = TRU.
+ *	        U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU.
  *    
  *   x       - (input/output) doublecomplex*
  *             Before entry, the incremented array X must contain the n   
@@ -88,7 +88,7 @@ void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
  *
  *   info    - (output) int*
  *             If *info = -i, the i-th argument had an illegal value.
- * 
+ *
  */
 int
 sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
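
   Note (illustrative only, not part of this patch): a minimal sketch of how the
   calling convention documented in the header comment above would be used. It
   assumes the trailing arguments after L are U, x, and info in the order listed
   in that comment (the hunk truncates the prototype), and that the factors of
   Pr*A*Pc = L*U have already been computed, with x holding the permuted
   right-hand side on entry. The helper name ztrisolve_sketch is hypothetical.

   /* Hypothetical usage sketch for sp_ztrsv_dist(); not part of the patch. */
   #include "superlu_zdefs.h"

   static void ztrisolve_sketch(SuperMatrix *L, SuperMatrix *U, doublecomplex *x)
   {
       int info = 0;

       /* x := inv(L)*x : unit lower triangular solve, no transpose. */
       sp_ztrsv_dist("L", "N", "U", L, U, x, &info);

       /* x := inv(U)*x : non-unit upper triangular solve, no transpose. */
       if (info == 0)
           sp_ztrsv_dist("U", "N", "N", L, U, x, &info);

       /* On failure, -info gives the position of the illegal argument. */
   }
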
@@ -96,16 +96,14 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 {
 
 #ifdef _CRAY
-    _fcd ftcs1 = _cptofcd("L", strlen("L")),
-	 ftcs2 = _cptofcd("N", strlen("N")),
-	 ftcs3 = _cptofcd("U", strlen("U"));
+    _fcd ftcs1, ftcs2, ftcs3;
 #endif
     SCformat *Lstore;
     NCformat *Ustore;
     doublecomplex   *Lval, *Uval;
     int incx = 1, incy = 1;
     doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
-    doublecomplex comp_zero = {0.0, 0.0};
+    doublecomplex comp_zero = {0.0, 0.0}, comp_temp;
     int nrow;
     int fsupc, nsupr, nsupc, luptr, istart, irow;
     int i, k, iptr, jcol;
@@ -115,27 +113,27 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 
     /* Test the input parameters */
     *info = 0;
-    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U", 1) != 0 ) *info = -1;
-    else if ( strncmp(trans, "N", 1) != 0 && strncmp(trans, "T", 1) != 0 )
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U",1) !=0 ) *info = -1;
+    else if ( strncmp(trans, "N",1) !=0 && strncmp(trans, "T", 1) !=0 )
 	*info = -2;
-    else if ( strncmp(diag, "U", 1) != 0 && strncmp(diag, "N", 1) !=0 )
+    else if ( strncmp(diag, "U", 1) !=0 && strncmp(diag, "N", 1) != 0 )
 	*info = -3;
     else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
     else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
     if ( *info ) {
 	i = -(*info);
-	xerr_dist("sp_ztrsv", &i);
+	xerr_dist("sp_ztrsv_dist", &i);
 	return 0;
     }
 
-    Lstore = L->Store;
-    Lval = Lstore->nzval;
-    Ustore = U->Store;
-    Uval = Ustore->nzval;
+    Lstore = (SCformat *) L->Store;
+    Lval = (doublecomplex *) Lstore->nzval;
+    Ustore = (NCformat *) U->Store;
+    Uval = (doublecomplex *) Ustore->nzval;
     solve_ops = 0;
 
     if ( !(work = doublecomplexCalloc_dist(L->nrow)) )
-	ABORT("Malloc fails for work in sp_ztrsv().");
+	ABORT("Malloc fails for work in sp_ztrsv_dist().");
     
     if ( strncmp(trans, "N", 1)==0 ) {	/* Form x := inv(A)*x. */
 	
@@ -150,20 +148,21 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 		nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
 		luptr = SuperLU_L_NZ_START(fsupc);
 		nrow = nsupr - nsupc;
-
 	        solve_ops += 4 * nsupc * (nsupc - 1);
 	        solve_ops += 8 * nrow * nsupc;
-
 		if ( nsupc == 1 ) {
 		    for (iptr=istart+1; iptr < SuperLU_L_SUB_START(fsupc+1); ++iptr) {
 			irow = SuperLU_L_SUB(iptr);
 			++luptr;
-			zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
+			zz_mult(&comp_temp, &x[fsupc], &Lval[luptr]);
+			z_sub(&x[irow], &x[irow], &comp_temp);
 		    }
 		} else {
 #ifdef USE_VENDOR_BLAS
 #ifdef _CRAY
+		    ftcs1 = _cptofcd("L", strlen("L"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    ftcs3 = _cptofcd("U", strlen("U"));
 		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
 		       	&x[fsupc], &incx);
 		
@@ -175,11 +174,11 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 		
 		    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
 		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
-#endif		
+#endif /* _CRAY */		
 #else
-		    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		    zlsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]);
 		
-		    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+		    zmatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
 			&x[fsupc], &work[0] );
 #endif		
 		
@@ -188,7 +187,6 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 			irow = SuperLU_L_SUB(iptr);
 			z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
 			work[i] = comp_zero;
-
 		    }
 	 	}
 	    } /* for k ... */
@@ -203,20 +201,21 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
 	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
 	    	luptr = SuperLU_L_NZ_START(fsupc);
-		
     	        solve_ops += 4 * nsupc * (nsupc + 1);
 
 		if ( nsupc == 1 ) {
 		    slud_z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
 		    for (i = SuperLU_U_NZ_START(fsupc); i < SuperLU_U_NZ_START(fsupc+1); ++i) {
 			irow = SuperLU_U_SUB(i);
-			zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
+			zz_mult(&comp_temp, &x[fsupc], &Uval[i]);
+			z_sub(&x[irow], &x[irow], &comp_temp);
 		    }
 		} else {
 #ifdef USE_VENDOR_BLAS
 #ifdef _CRAY
-		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		    ftcs1 = _cptofcd("U", strlen("U"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    CTRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
 		       &x[fsupc], &incx);
 #else
 		    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
@@ -231,8 +230,8 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 		    	for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); 
 				i++) {
 			    irow = SuperLU_U_SUB(i);
-			zz_mult(&comp_zero, &x[jcol], &Uval[i]);
-			z_sub(&x[irow], &x[irow], &comp_zero);
+			    zz_mult(&comp_temp, &x[jcol], &Uval[i]);
+			    z_sub(&x[irow], &x[irow], &comp_temp);
 		    	}
                     }
 		}
@@ -253,20 +252,20 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 	    	luptr = SuperLU_L_NZ_START(fsupc);
 
 		solve_ops += 8 * (nsupr - nsupc) * nsupc;
-
 		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
 		    iptr = istart + nsupc;
 		    for (i = SuperLU_L_NZ_START(jcol) + nsupc; 
 				i < SuperLU_L_NZ_START(jcol+1); i++) {
 			irow = SuperLU_L_SUB(iptr);
-			zz_mult(&comp_zero, &x[irow], &Lval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
+			zz_mult(&comp_temp, &x[irow], &Lval[i]);
+		    	z_sub(&x[jcol], &x[jcol], &comp_temp);
 			iptr++;
 		    }
 		}
 		
 		if ( nsupc > 1 ) {
 		    solve_ops += 4 * nsupc * (nsupc - 1);
+
 #ifdef USE_VENDOR_BLAS
 #ifdef _CRAY
                     ftcs1 = _cptofcd("L", strlen("L"));
@@ -298,13 +297,12 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 		    solve_ops += 8*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
 		    for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); i++) {
 			irow = SuperLU_U_SUB(i);
-			zz_mult(&comp_zero, &x[irow], &Uval[i]);
-		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
+			zz_mult(&comp_temp, &x[irow], &Uval[i]);
+		    	z_sub(&x[jcol], &x[jcol], &comp_temp);
 		    }
 		}
 
 		solve_ops += 4 * nsupc * (nsupc + 1);
-
 		if ( nsupc == 1 ) {
 		    slud_z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
 		} else {
@@ -331,17 +329,15 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
     /*SuperLUStat.ops[SOLVE] += solve_ops;*/
     SUPERLU_FREE(work);
     return 0;
-}
+} /* sp_ztrsv_dist */
 
 
-
-/*! \brief
-
+/*! \brief SpGEMV
 
   Purpose   
     =======   
 
-    sp_zgemv()  performs one of the matrix-vector operations   
+    sp_zgemv_dist()  performs one of the matrix-vector operations   
        y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
     where alpha and beta are scalars, x and y are vectors and A is a
     sparse A->nrow by A->ncol matrix.   
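
   Note (illustrative only, not part of this patch): a call realizing the
   operation just described might look like the sketch below. The argument order
   follows the sp_zgemv_dist prototype shown later in this file
   (trans, alpha, A, x, incx, beta, y, incy); the sparse matrix A is assumed to
   be set up elsewhere in one of the storage types listed further down, and the
   helper name zspmv_sketch is hypothetical.

   /* Hypothetical usage sketch for sp_zgemv_dist(); not part of the patch. */
   #include "superlu_zdefs.h"

   static void zspmv_sketch(SuperMatrix *A, doublecomplex *x, doublecomplex *y)
   {
       doublecomplex alpha = {1.0, 0.0};   /* scales A*x       */
       doublecomplex beta  = {0.0, 0.0};   /* y is overwritten */

       /* y := alpha*A*x + beta*y, with unit strides for x and y. */
       sp_zgemv_dist("N", alpha, A, x, 1, beta, y, 1);
   }
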
@@ -356,12 +352,14 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
                 TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
                 TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
 
-    ALPHA  - (input) doublecomplex
+    ALPHA  - (input) doublecomplex
              On entry, ALPHA specifies the scalar alpha.   
 
     A      - (input) SuperMatrix*
-             Before entry, the leading m by n part of the array A must   
-             contain the matrix of coefficients.   
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_Z; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
 
     X      - (input) doublecomplex*, array of DIMENSION at least   
              ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
@@ -394,10 +392,10 @@ sp_ztrsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L,
 
*/ int -sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, - doublecomplex *x, int incx, doublecomplex beta, - doublecomplex *y, int incy) +sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x, + int incx, doublecomplex beta, doublecomplex *y, int incy) { + /* Local variables */ NCformat *Astore; doublecomplex *Aval; @@ -406,31 +404,30 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, int lenx, leny, i, j, irow; int iy, jx, jy, kx, ky; int notran; - doublecomplex comp_zero = {0.0, 0.0}; - doublecomplex comp_one = {1.0, 0.0}; + doublecomplex zero = {0.0, 0.0}; + doublecomplex one = {1.0, 0.0}; notran = (strncmp(trans, "N", 1)==0); - Astore = A->Store; - Aval = Astore->nzval; + Astore = (NCformat *) A->Store; + Aval = (doublecomplex *) Astore->nzval; /* Test the input parameters */ info = 0; - if ( !notran && strncmp(trans, "T", 1) != 0 && strncmp(trans, "C", 1) != 0) + if ( !notran && strncmp(trans, "T", 1) !=0 && strncmp(trans, "C", 1) != 0) info = 1; else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; else if (incx == 0) info = 5; else if (incy == 0) info = 8; if (info != 0) { - xerr_dist("sp_zgemv ", &info); + xerr_dist("sp_zgemv_dist ", &info); return 0; } /* Quick return if possible. */ - if ( A->nrow == 0 || A->ncol == 0 || - (z_eq(&alpha, &comp_zero) && z_eq(&beta, &comp_one)) ) + if (A->nrow == 0 || A->ncol == 0 || + (z_eq(&alpha, &zero) && z_eq(&beta, &one)) ) return 0; - /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if ( strncmp(trans, "N", 1)==0 ) { @@ -448,18 +445,17 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ - if ( !z_eq(&beta, &comp_one) ) { + if ( !z_eq(&beta, &one) ) { if (incy == 1) { - if ( z_eq(&beta, &comp_zero) ) - for (i = 0; i < leny; ++i) y[i] = comp_zero; + if ( z_eq(&beta, &zero) ) + for (i = 0; i < leny; ++i) y[i] = zero; else - for (i = 0; i < leny; ++i) - zz_mult(&y[i], &beta, &y[i]); + for (i = 0; i < leny; ++i) zz_mult(&y[i], &beta, &y[i]); } else { iy = ky; - if ( z_eq(&beta, &comp_zero) ) + if ( z_eq(&beta, &zero) ) for (i = 0; i < leny; ++i) { - y[iy] = comp_zero; + y[iy] = zero; iy += incy; } else @@ -470,14 +466,14 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, } } - if ( z_eq(&alpha, &comp_zero) ) return 0; + if ( z_eq(&alpha, &zero) ) return 0; if ( notran ) { /* Form y := alpha*A*x + y. 
*/ jx = kx; if (incy == 1) { for (j = 0; j < A->ncol; ++j) { - if ( !z_eq(&x[jx], &comp_zero) ) { + if ( !z_eq(&x[jx], &zero) ) { zz_mult(&temp, &alpha, &x[jx]); for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; @@ -495,7 +491,7 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { - temp = comp_zero; + temp = zero; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; zz_mult(&temp1, &Aval[i], &x[irow]); @@ -510,5 +506,4 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, } } return 0; -} /* sp_zgemv */ - +} /* sp_zgemv_dist */ diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu index 06d72c25..4ea207e4 100644 --- a/SRC/zsuperlu_gpu.cu +++ b/SRC/zsuperlu_gpu.cu @@ -17,18 +17,10 @@ #undef Reduce -//#include +//#include #include "zlustruct_gpu.h" -#if 0 -#ifdef HAVE_CUDA -#include "superlu_gpu_utils.cu" -#elif defined(HAVE_HIP) -#include "superlu_gpu_utils.hip.cpp" -#endif -#endif - #include "dcomplex.h" //extern "C" { @@ -41,7 +33,7 @@ // #if defined(DEBUG) || defined(_DEBUG) // if (result != GPUBLAS_STATUS_SUCCESS) // { -// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", gpublasGetErrorString(result)); +// fprintf(stderr, "GPU BLAS Runtime Error: %s\n", gpublasGetErrorString(result)); // assert(result == GPUBLAS_STATUS_SUCCESS); // } // #endif @@ -231,7 +223,7 @@ void Scatter_GPU_kernel( typedef int pfx_dtype ; extern __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n); - + doublecomplex *tempv1; if (jj_st == jj0) { @@ -555,15 +547,10 @@ int zSchurCompUpdate_GPU( doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; /* The following are used in gpublasZgemm() call */ -#if 0 - hipblasDoubleComplex *cu_alpha = (hipblasDoubleComplex*) α - hipblasDoubleComplex *cu_beta = (hipblasDoubleComplex*) β - hipblasDoubleComplex *cu_A, *cu_B, *cu_C; /* C <- A*B */ -#else - gpuDoubleComplex *cu_alpha = (gpuDoubleComplex*) α - gpuDoubleComplex *cu_beta = (gpuDoubleComplex*) β + gpuDoubleComplex *cu_alpha = (gpuDoubleComplex *) α + gpuDoubleComplex *cu_beta = (gpuDoubleComplex *) β gpuDoubleComplex *cu_A, *cu_B, *cu_C; /* C <- A*B */ -#endif + int_t ii_st = 0; int_t ii_end = 0; int_t maxGemmBlockDim = (int) sqrt(buffer_size); @@ -661,15 +648,9 @@ int zSchurCompUpdate_GPU( assert(nrows * ncols <= buffer_size); gpublasSetStream(gpublas_handle0, FunCallStream); gpuEventRecord(A_gpu->GemmStart[k0], FunCallStream); -#if 0 - cu_A = (hipblasDoubleComplex*) &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row]; - cu_B = (hipblasDoubleComplex*) &A_gpu->scubufs[streamId].bigU[st_col * ldu]; - cu_C = (hipblasDoubleComplex*) A_gpu->scubufs[streamId].bigV; -#else cu_A = (gpuDoubleComplex*) &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row]; cu_B = (gpuDoubleComplex*) &A_gpu->scubufs[streamId].bigU[st_col * ldu]; cu_C = (gpuDoubleComplex*) A_gpu->scubufs[streamId].bigV; -#endif gpublasZgemm(gpublas_handle0, GPUBLAS_OP_N, GPUBLAS_OP_N, nrows, ncols, ldu, cu_alpha, cu_A, Rnbrow, cu_B, ldu, cu_beta, @@ -892,7 +873,7 @@ int zinitSluGPU3D_t( int_t ldt /* NSUP read from sp_ienv(3) */ ) { - checkGPUErrors(gpuDeviceReset ()) ; + checkGPUErrors(gpuDeviceReset ()); Glu_persist_t *Glu_persist = LUstruct->Glu_persist; zLocalLU_t *Llu = LUstruct->Llu; int* isNodeInMyGrid = sluGPU->isNodeInMyGrid; diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c index 6688710a..2506cd13 100644 --- a/SRC/zutil_dist.c +++ 
b/SRC/zutil_dist.c @@ -608,11 +608,11 @@ void zPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid, } printf("(%d)", iam); PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]); - PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); + PrintInt32("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]); } - printf("nfrecvx " IFMT "\n", Llu->nfrecvx); + printf("nfrecvx %d\n", Llu->nfrecvx); k = CEILING( nsupers, grid->nprow ); - PrintInt10("fmod", k, Llu->fmod); + PrintInt32("fmod", k, Llu->fmod); } /* ZPRINTLBLOCKS */ @@ -623,13 +623,13 @@ void zZeroLblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct) { doublecomplex zero = {0.0, 0.0}; register int extra, gb, j, lb, nsupc, nsupr, ncb; - register int_t k, mycol, r; + register int k, mycol, r; zLocalLU_t *Llu = LUstruct->Llu; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; int_t *xsup = Glu_persist->xsup; int_t *index; doublecomplex *nzval; - int_t nsupers = Glu_persist->supno[n-1] + 1; + int nsupers = Glu_persist->supno[n-1] + 1; ncb = nsupers / grid->npcol; extra = nsupers % grid->npcol; diff --git a/example_scripts/run_cmake_build_debug.sh b/example_scripts/run_cmake_build_debug.sh index af86ac0b..4171ec01 100644 --- a/example_scripts/run_cmake_build_debug.sh +++ b/example_scripts/run_cmake_build_debug.sh @@ -17,7 +17,7 @@ fi cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so;${LIB_VTUNE}" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLAS=OFF \ -DBUILD_SHARED_LIBS=ON \ -DCMAKE_C_COMPILER=cc \ -DCMAKE_CXX_COMPILER=CC \ diff --git a/example_scripts/run_cmake_build_summit_gcc_gpu.sh b/example_scripts/run_cmake_build_summit_gcc_gpu.sh index 753fba7e..86e0a201 100644 --- a/example_scripts/run_cmake_build_summit_gcc_gpu.sh +++ b/example_scripts/run_cmake_build_summit_gcc_gpu.sh @@ -25,7 +25,7 @@ cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include;${OLCF_CUDA_ROOT}/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \ -DTPL_CUDA_LIBRARIES="${OLCF_CUDA_ROOT}/lib64/libcublas.so;${OLCF_CUDA_ROOT}/lib64/libcusparse.so;${OLCF_CUDA_ROOT}/lib64/libcudart.so" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DBUILD_SHARED_LIBS=ON \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpiCC \ diff --git a/example_scripts/run_cmake_build_summit_gcc_gpu_10.sh b/example_scripts/run_cmake_build_summit_gcc_gpu_10.sh index 73ec374e..24c48497 100644 --- a/example_scripts/run_cmake_build_summit_gcc_gpu_10.sh +++ b/example_scripts/run_cmake_build_summit_gcc_gpu_10.sh @@ -25,7 +25,7 @@ cmake .. 
\ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include;${OLCF_CUDA_ROOT}/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \ -DTPL_CUDA_LIBRARIES="${OLCF_CUDA_ROOT}/lib64/libcublas.so;${OLCF_CUDA_ROOT}/lib64/libcusparse.so;${OLCF_CUDA_ROOT}/lib64/libcudart.so" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLAS=OFF \ -DBUILD_SHARED_LIBS=ON \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpiCC \ diff --git a/example_scripts/run_cmake_build_summit_gcc_nogpu.sh b/example_scripts/run_cmake_build_summit_gcc_nogpu.sh index dda6162c..f20a4e9e 100644 --- a/example_scripts/run_cmake_build_summit_gcc_nogpu.sh +++ b/example_scripts/run_cmake_build_summit_gcc_nogpu.sh @@ -24,7 +24,7 @@ rm -rf DartConfiguration.tcl cmake .. \ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include;${OLCF_CUDA_ROOT}/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \ - -Denable_blaslib=OFF \ + -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DBUILD_SHARED_LIBS=ON \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_CXX_COMPILER=mpiCC \