From 173c2608441f30f72b67f414a4424ab1a264ea79 Mon Sep 17 00:00:00 2001 From: liuyang Date: Wed, 14 Feb 2018 10:35:10 -0800 Subject: [PATCH] Double complex version added --- EXAMPLE/dcreate_matrix.c | 27 +- EXAMPLE/dcreate_matrix_perturbed.c | 6 +- EXAMPLE/pddrive.c | 32 +- EXAMPLE/pddrive1.c | 10 +- EXAMPLE/pddrive2.c | 15 +- EXAMPLE/pddrive3.c | 8 +- EXAMPLE/pddrive4.c | 13 +- EXAMPLE/pzdrive.c | 54 +- EXAMPLE/pzdrive1.c | 17 +- EXAMPLE/pzdrive2.c | 25 +- EXAMPLE/pzdrive3.c | 17 +- EXAMPLE/pzdrive4.c | 17 +- EXAMPLE/zcreate_matrix.c | 189 ++ EXAMPLE/zcreate_matrix_perturbed.c | 189 ++ SRC/CMakeLists.txt | 1 + SRC/Makefile | 2 +- SRC/pddistribute.c | 3321 +++++++++++------------ SRC/pdgssvx.c | 26 +- SRC/pdgstrs.c | 965 +++---- SRC/pdgstrs_lsum.c | 818 +++--- SRC/pdsymbfact_distdata.c | 1248 +++++---- SRC/pdutil.c | 15 +- SRC/pzdistribute.c | 918 ++++++- SRC/pzgssvx.c | 56 +- SRC/pzgstrs.c | 2154 ++++++++++----- SRC/pzgstrs_lsum.c | 1796 +++++++++++- SRC/pzsymbfact_distdata.c | 859 +++++- SRC/pzutil.c | 46 + SRC/superlu_ddefs.h | 34 +- SRC/superlu_dist_config.h | 10 +- SRC/superlu_zdefs.h | 50 +- TEST/pdtest.c | 57 +- TEST/pztest.c | 57 +- build/batch_script_mpi_runit_pureOMP.sh | 193 -- make.inc_good_static | 2 +- make.inc_tmp | 40 + 36 files changed, 8760 insertions(+), 4527 deletions(-) delete mode 100644 build/batch_script_mpi_runit_pureOMP.sh create mode 100644 make.inc_tmp diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c index d90185e8..67afe16b 100644 --- a/EXAMPLE/dcreate_matrix.c +++ b/EXAMPLE/dcreate_matrix.c @@ -63,8 +63,7 @@ at the top-level directory. * */ - - int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs, +int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, gridinfo_t *grid) { @@ -90,14 +89,14 @@ at the top-level directory. #endif if ( !iam ) { - double t = SuperLU_timer_(); + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); - /* Read the matrix stored on disk in Harwell-Boeing format. */ - dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); - printf("Time to read and distribute matrix %.2f\n", - SuperLU_timer_() - t); fflush(stdout); - + SuperLU_timer_() - t); fflush(stdout); + /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); @@ -235,9 +234,8 @@ at the top-level directory. return 0; } - - - + + int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, char * postfix, gridinfo_t *grid) @@ -264,8 +262,9 @@ int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs, #endif if ( !iam ) { - double t = SuperLU_timer_(); - if(!strcmp(postfix,"rua")){ + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"rua")){ /* Read the matrix stored on disk in Harwell-Boeing format. */ dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); }else if(!strcmp(postfix,"mtx")){ @@ -286,7 +285,7 @@ int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs, printf("Time to read and distribute matrix %.2f\n", SuperLU_timer_() - t); fflush(stdout); - + /* Broadcast matrix A to the other PEs. 
*/ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); diff --git a/EXAMPLE/dcreate_matrix_perturbed.c b/EXAMPLE/dcreate_matrix_perturbed.c index 4c6ae016..00482198 100644 --- a/EXAMPLE/dcreate_matrix_perturbed.c +++ b/EXAMPLE/dcreate_matrix_perturbed.c @@ -258,7 +258,7 @@ int dcreate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, double **rhs, if ( !iam ) { double t = SuperLU_timer_(); - if(!strcmp(postfix,"rua")){ + if(!strcmp(postfix,"rua")){ /* Read the matrix stored on disk in Harwell-Boeing format. */ dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); }else if(!strcmp(postfix,"mtx")){ @@ -279,7 +279,7 @@ int dcreate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, double **rhs, printf("Time to read and distribute matrix %.2f\n", SuperLU_timer_() - t); fflush(stdout); - + /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); @@ -416,4 +416,4 @@ int dcreate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, double **rhs, CHECK_MALLOC(iam, "Exit dcreate_matrix()"); #endif return 0; -} \ No newline at end of file +} diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index a83ac9aa..0c666eef 100644 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -22,7 +22,6 @@ at the top-level directory. */ #include -#include #include "superlu_ddefs.h" /*! \brief @@ -62,7 +61,7 @@ int main(int argc, char *argv[]) int m, n; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; + char **cpp, c, *postfix;; FILE *fp, *fopen(); int cpp_defs(); int ii; @@ -70,8 +69,8 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs =1; /* Number of right-hand side. */ - + nrhs = 1; /* Number of right-hand side. */ + /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ @@ -86,7 +85,7 @@ int main(int argc, char *argv[]) #if ( VTUNE>=1 ) __itt_pause(); #endif - + /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -116,7 +115,7 @@ int main(int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); - + if(grid.iam==0){ MPI_Query_thread(&omp_mpi_level); switch (omp_mpi_level) { @@ -138,7 +137,7 @@ int main(int argc, char *argv[]) break; } } - + /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; @@ -167,7 +166,6 @@ int main(int argc, char *argv[]) } // printf("%s\n", postfix); - /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
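The EXAMPLE drivers touched above (pddrive.c, pddrive1.c, and the remaining pddrive*/pzdrive* files below) all gain one extra teardown call, dDestroy_Tree() (zDestroy_Tree() in the complex drivers), issued just before Destroy_LU(). A condensed sketch of the resulting cleanup order, copied from the pddrive.c hunk; the comment on what dDestroy_Tree releases is an inference from the BcTree/RdTree structures introduced in pddistribute.c further down, not a statement made by the patch itself:

    PStatFree(&stat);
    Destroy_CompRowLoc_Matrix_dist(&A);      /* local CSR copy of A            */
    ScalePermstructFree(&ScalePermstruct);   /* row/column scalings and perms  */
    dDestroy_Tree(n, &grid, &LUstruct);      /* new: presumably frees the
                                                broadcast/reduction trees      */
    Destroy_LU(n, &grid, &LUstruct);         /* numerical L and U factors      */
    LUstructFree(&LUstruct);
    if ( options.SolveInitialized ) {
        dSolveFinalize(&options, &SOLVEstruct);
    }
    SUPERLU_FREE(b);  SUPERLU_FREE(xtrue);  SUPERLU_FREE(berr);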
------------------------------------------------------------*/ @@ -192,24 +190,19 @@ int main(int argc, char *argv[]) options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; + options.DiagInv = NO; */ set_default_options_dist(&options); + // options.DiagInv = YES; + #if 0 - options.ColPerm = PARMETIS; - options.ParSymbFact = YES; options.RowPerm = NOROWPERM; options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; options.Equil = NO; + options.ReplaceTinyPivot = YES; #endif -// // options.ParSymbFact = YES; -// // options.ColPerm = PARMETIS; -// // options.RowPerm = NOROWPERM; - // options.IterRefine = 0; -// // options.DiagInv = YES; - // options.ReplaceTinyPivot = NO; - // options.SymPattern = YES; - if (!iam) { print_sp_ienv_dist(&options); print_options_dist(&options); @@ -244,7 +237,8 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); - Destroy_LU(n, &grid, &LUstruct); + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { dSolveFinalize(&options, &SOLVEstruct); diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index cf43cd90..6721fde2 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -22,7 +22,6 @@ at the top-level directory. */ #include -#include #include "superlu_ddefs.h" /*! \brief @@ -129,13 +128,12 @@ int main(int argc, char *argv[]) } } // printf("%s\n", postfix); - + /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - - if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) + if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); for (j = 0; j < nrhs; ++j) for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; @@ -160,7 +158,6 @@ int main(int argc, char *argv[]) options.PrintStat = YES; */ set_default_options_dist(&options); - printf("options.ColPerm = %d\n", options.ColPerm); if (!iam) { print_sp_ienv_dist(&options); @@ -216,7 +213,8 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); - Destroy_LU(n, &grid, &LUstruct); + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { dSolveFinalize(&options, &SOLVEstruct); diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c index 7e45dcfc..e688ac01 100644 --- a/EXAMPLE/pddrive2.c +++ b/EXAMPLE/pddrive2.c @@ -23,7 +23,6 @@ at the top-level directory. */ #include -#include #include "superlu_ddefs.h" /*! \brief @@ -70,8 +69,8 @@ int main(int argc, char *argv[]) /* prototypes */ extern int dcreate_matrix_perturbed (SuperMatrix *, int, double **, int *, double **, int *, - FILE *, gridinfo_t *); - extern int dcreate_matrix_perturbed_postfix + FILE *, gridinfo_t *); + extern int dcreate_matrix_perturbed_postfix (SuperMatrix *, int, double **, int *, double **, int *, FILE *, char *, gridinfo_t *); @@ -137,9 +136,9 @@ int main(int argc, char *argv[]) } } // printf("%s\n", postfix); - + /* ------------------------------------------------------------ - GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. 
------------------------------------------------------------*/ dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); @@ -191,7 +190,8 @@ int main(int argc, char *argv[]) PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ - Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with the L and U matrices. */ SUPERLU_FREE(b); /* Free storage of right-hand side. */ SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ @@ -239,7 +239,8 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ - Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with the L and U matrices. */ ScalePermstructFree(&ScalePermstruct); LUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c index f2e40aad..e3ef0f4b 100644 --- a/EXAMPLE/pddrive3.c +++ b/EXAMPLE/pddrive3.c @@ -22,7 +22,6 @@ at the top-level directory. */ #include -#include #include "superlu_ddefs.h" /*! \brief @@ -68,7 +67,7 @@ int main(int argc, char *argv[]) int_t i, j, ii, m, n, nnz_loc, m_loc, fst_row; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix;; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -134,7 +133,7 @@ int main(int argc, char *argv[]) } } // printf("%s\n", postfix); - + /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ @@ -240,7 +239,8 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ - Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with the L and U matrices. */ ScalePermstructFree(&ScalePermstruct); LUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index 82a0b14d..6242a73a 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) int_t usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ - char **cpp, c, *postfix;; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -134,14 +134,12 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif - for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ iam = grid1.iam; /* Get the logical number in the new grid. */ @@ -149,9 +147,8 @@ int main(int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
------------------------------------------------------------*/ - dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); - - + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); @@ -205,6 +202,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); + dDestroy_Tree(n, &grid1, &LUstruct); Destroy_LU(n, &grid1, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { @@ -220,7 +218,7 @@ int main(int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ - dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); @@ -270,6 +268,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); + dDestroy_Tree(n, &grid2, &LUstruct); Destroy_LU(n, &grid2, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index b1785b8e..8de98873 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -60,10 +60,12 @@ int main(int argc, char *argv[]) int m, n; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c; + char **cpp, c, *postfix;; FILE *fp, *fopen(); int cpp_defs(); - + int ii; + int omp_mpi_level; + nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ @@ -72,7 +74,17 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); + //MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( VTUNE>=1 ) + __itt_pause(); +#endif + /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -102,7 +114,29 @@ int main(int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); - + + if(grid.iam==0){ + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; @@ -124,10 +158,17 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif + for(ii = 0;ii -#include #include "superlu_zdefs.h" /*! 
\brief @@ -55,10 +54,10 @@ int main(int argc, char *argv[]) gridinfo_t grid; double *berr; doublecomplex *b, *xtrue, *b1; - int i, j, m, n; + int i, j, m, n, ii; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -122,10 +121,17 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif + for(ii = 0;ii -#include #include "superlu_zdefs.h" /*! \brief @@ -59,10 +58,10 @@ int main(int argc, char *argv[]) double *berr; doublecomplex *b, *b1, *xtrue, *xtrue1; int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc; + int_t i, j, ii, m, n, nnz_loc, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -70,6 +69,9 @@ int main(int argc, char *argv[]) extern int zcreate_matrix_perturbed (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); + extern int zcreate_matrix_perturbed_postfix + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, char *, gridinfo_t *); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ @@ -127,10 +129,17 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif + for(ii = 0;ii -#include #include "superlu_zdefs.h" /*! \brief @@ -64,10 +63,10 @@ int main(int argc, char *argv[]) double *berr; doublecomplex *b, *b1, *xtrue, *nzval, *nzval1; int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc, fst_row; + int_t i, j, ii, m, n, nnz_loc, m_loc, fst_row; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -127,10 +126,17 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif + for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ iam = grid1.iam; /* Get the logical number in the new grid. */ /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ - zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid1); + zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); @@ -194,6 +201,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); + zDestroy_Tree(n, &grid1, &LUstruct); Destroy_LU(n, &grid1, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { @@ -209,7 +217,7 @@ int main(int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
------------------------------------------------------------*/ - zcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid2); + zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); @@ -259,6 +267,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); ScalePermstructFree(&ScalePermstruct); + zDestroy_Tree(n, &grid2, &LUstruct); Destroy_LU(n, &grid2, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { diff --git a/EXAMPLE/zcreate_matrix.c b/EXAMPLE/zcreate_matrix.c index 86601437..91512b12 100644 --- a/EXAMPLE/zcreate_matrix.c +++ b/EXAMPLE/zcreate_matrix.c @@ -232,3 +232,192 @@ int zcreate_matrix(SuperMatrix *A, int nrhs, doublecomplex **rhs, #endif return 0; } + + + +int zcreate_matrix_postfix(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter zcreate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"cua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. 
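The new zcreate_matrix_postfix() above picks a reader from the file-name suffix: "cua" -> zreadhb_dist, "mtx" -> zreadMM_dist, "rb" -> zreadrb_dist, "dat" -> zreadtriple_dist, "bin" -> zread_binary. The driver-side loop that extracts that suffix is not fully legible in this copy of the patch, so the helper below is only an illustrative reconstruction; file_postfix is a hypothetical name, not a routine added by the patch:

    #include <string.h>

    /* Hypothetical helper (not in the patch): return the text after the
       last '.' in a file name, which the *_postfix readers switch on. */
    static char *file_postfix(char *name)
    {
        char *postfix = NULL;
        int ii;
        for (ii = 0; ii < (int) strlen(name); ii++)
            if (name[ii] == '.') postfix = &name[ii + 1];
        return postfix;   /* e.g. "cua", "mtx", "rb", "dat", "bin" */
    }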
*/ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0].r = 0.1; nzval[0].i = 0.0; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) zPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit zcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/zcreate_matrix_perturbed.c b/EXAMPLE/zcreate_matrix_perturbed.c index 92b79635..8ae551fe 100644 --- 
a/EXAMPLE/zcreate_matrix_perturbed.c +++ b/EXAMPLE/zcreate_matrix_perturbed.c @@ -227,3 +227,192 @@ int zcreate_matrix_perturbed(SuperMatrix *A, int nrhs, doublecomplex **rhs, #endif return 0; } + + + +int zcreate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char *postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter zcreate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + if(!strcmp(postfix,"cua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. */ + nzval[0].r *= 0.01; nzval[0].i *= 0.01; + nzval[nnz-1].r *= 0.0001; nzval[nnz-1].i *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. 
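Both zcreate_matrix_postfix() and zcreate_matrix_perturbed_postfix() partition the rows of the global matrix the same way before building the SLU_NR_loc structure. A minimal restatement of that arithmetic, with P introduced here for brevity and the other variable names as in the patch:

    /* Row-block partition used by the *create_matrix* routines (sketch).
       Ranks 0 .. P-2 each get floor(m/P) rows; the last rank absorbs the
       remainder; fst_row is this rank's first global row index. */
    int_t P         = grid->nprow * grid->npcol;
    int_t m_loc     = m / P;
    int_t m_loc_fst = m_loc;                /* rows on each of the first P-1 ranks */
    if ( m_loc * P != m && iam == P - 1 )   /* m not divisible by P */
        m_loc = m - m_loc * (P - 1);        /* last rank takes what is left */
    int_t fst_row = iam * m_loc_fst;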
*/ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) zPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit zcreate_matrix()"); +#endif + return 0; +} diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 400921d4..78eb3886 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -111,6 +111,7 @@ if(enable_complex16) zreadhb.c zreadrb.c zreadtriple.c + zbinary_io.c zreadMM.c pzgsequ.c pzlaqgs.c diff --git a/SRC/Makefile b/SRC/Makefile index eb0c0021..a6cf1cbc 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -62,7 +62,7 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ # # Routines for double complex parallel SuperLU ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ - zreadhb.o zreadrb.o zreadtriple.o zreadMM.o \ + zreadhb.o zreadrb.o zreadtriple.o zreadMM.o zbinary_io.o\ pzgsequ.o pzlaqgs.o zldperm_dist.o pzlangs.o pzutil.o \ pzsymbfact_distdata.o zdistribute.o pzdistribute.o \ pzgstrf.o pzgstrf2.o pzGetDiagU.o \ diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c index d9292447..9309fb1d 100644 --- a/SRC/pddistribute.c +++ 
b/SRC/pddistribute.c @@ -7,7 +7,7 @@ All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. - */ +*/ /*! @file @@ -23,8 +23,7 @@ at the top-level directory. #ifndef CACHELINE #define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ -#endif - +#endif /*! \brief * @@ -61,346 +60,349 @@ at the top-level directory. * ============ * */ - int_t +int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, - gridinfo_t *grid, int_t *colptr[], int_t *rowind[], - double *a[]) + Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, + gridinfo_t *grid, int_t *colptr[], int_t *rowind[], + double *a[]) { - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; - int_t nnz_loc; /* number of local nonzeros */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; - double *nzval_a; - int iam, it, p, procs; - MPI_Request *send_req; - MPI_Status status; - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; + int_t nnz_loc; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ptr_to_send; + double *aij, **aij_send, *nzval, *dtemp; + double *nzval_a; + int iam, it, p, procs; + MPI_Request *send_req; + MPI_Status status; + + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dReDistribute_A()"); + CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - nnzToRecv = intCalloc_dist(2*procs); - nnzToSend = nnzToRecv + procs; - - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. 
- ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; - } + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nnzToRecv = intCalloc_dist(2*procs); + nnzToSend = nnzToRecv + procs; + + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; } + } - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ - } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate space for storing the triplets after redistribution. */ - if ( k ) { /* count can be zero. */ - if ( !(ia = intMalloc_dist(2*k)) ) - ABORT("Malloc fails for ia[]."); - if ( !(aij = doubleMalloc_dist(k)) ) - ABORT("Malloc fails for aij[]."); - } - ja = ia + k; - - /* Allocate temporary storage for sending/receiving the A triplets. 
*/ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) - ABORT("Malloc fails for ia_send[]."); - if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) - ABORT("Malloc fails for aij_send[]."); - if ( SendCnt ) { /* count can be zero */ - if ( !(index = intMalloc_dist(2*SendCnt)) ) - ABORT("Malloc fails for index[]."); - if ( !(nzval = doubleMalloc_dist(SendCnt)) ) - ABORT("Malloc fails for nzval[]."); - } - if ( !(ptr_to_send = intCalloc_dist(procs)) ) - ABORT("Malloc fails for ptr_to_send[]."); - if ( maxnnzToRecv ) { /* count can be zero */ - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) - ABORT("Malloc fails for itemp[]."); - if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) - ABORT("Malloc fails for dtemp[]."); - } - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - if ( !(*colptr = intCalloc_dist(n+1)) ) - ABORT("Malloc fails for *colptr[]."); - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. */ - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - ++(*colptr)[jcol]; /* Count nonzeros in each column */ - } - } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. -NOTE: Can possibly use MPI_Alltoallv. 
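The redistribution loop above posts one MPI_Isend/MPI_Recv pair per peer; the NOTE in the code remarks that MPI_Alltoallv could perform the same exchange collectively. A hedged sketch of that alternative for the numerical values only, assuming a sendbuf[]/recvbuf[] pair packed contiguously by destination rank — those buffers and the packing are assumptions, not part of the patch:

    /* Sketch only -- not the code path the patch uses.  MPI_Alltoallv takes
       plain-int counts and exclusive-prefix-sum displacements per rank. */
    int *scnt = (int *) SUPERLU_MALLOC( 4 * procs * sizeof(int) );
    int *sdsp = scnt + procs, *rcnt = sdsp + procs, *rdsp = rcnt + procs;
    for (p = 0; p < procs; ++p) {
        scnt[p] = (int) nnzToSend[p];
        rcnt[p] = (int) nnzToRecv[p];
        sdsp[p] = p ? sdsp[p-1] + scnt[p-1] : 0;
        rdsp[p] = p ? rdsp[p-1] + rcnt[p-1] : 0;
    }
    MPI_Alltoallv( sendbuf, scnt, sdsp, MPI_DOUBLE,
                   recvbuf, rcnt, rdsp, MPI_DOUBLE, grid->comm );
    SUPERLU_FREE(scnt);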
-------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, MPI_DOUBLE, - p, iam+procs, grid->comm, &send_req[procs+p] ); - } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /*assert(jcol 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - if ( SendCnt ) { - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - } - SUPERLU_FREE(ptr_to_send); - if ( maxnnzToRecv ) { - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - } + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. - ------------------------------------------------------------*/ - if ( nnz_loc ) { /* nnz_loc can be zero */ - if ( !(*rowind = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *rowind[]."); - if ( !(*a = doubleMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *a[]."); + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + + /* Allocate space for storing the triplets after redistribution. */ + if ( k ) { /* count can be zero. */ + if ( !(ia = intMalloc_dist(2*k)) ) + ABORT("Malloc fails for ia[]."); + if ( !(aij = doubleMalloc_dist(k)) ) + ABORT("Malloc fails for aij[]."); + } + ja = ia + k; + + /* Allocate temporary storage for sending/receiving the A triplets. */ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) + ABORT("Malloc fails for ia_send[]."); + if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) + ABORT("Malloc fails for aij_send[]."); + if ( SendCnt ) { /* count can be zero */ + if ( !(index = intMalloc_dist(2*SendCnt)) ) + ABORT("Malloc fails for index[]."); + if ( !(nzval = doubleMalloc_dist(SendCnt)) ) + ABORT("Malloc fails for nzval[]."); + } + if ( !(ptr_to_send = intCalloc_dist(procs)) ) + ABORT("Malloc fails for ptr_to_send[]."); + if ( maxnnzToRecv ) { /* count can be zero */ + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) + ABORT("Malloc fails for itemp[]."); + if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) + ABORT("Malloc fails for dtemp[]."); + } + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + aij_send[p] = &nzval[j]; + j += nnzToSend[p]; + } + } + } /* if procs > 1 */ + + if ( !(*colptr = intCalloc_dist(n+1)) ) + ABORT("Malloc fails for *colptr[]."); + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. 
*/ + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + ++(*colptr)[jcol]; /* Count nonzeros in each column */ + } } - - /* Initialize the array of column pointers */ - k = 0; - jsize = (*colptr)[0]; - (*colptr)[0] = 0; - for (j = 1; j < n; ++j) { - k += jsize; - jsize = (*colptr)[j]; - (*colptr)[j] = k; + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. + ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_DOUBLE, + p, iam+procs, grid->comm, &send_req[procs+p] ); } - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - j = ja[i]; - k = (*colptr)[j]; - (*rowind)[k] = ia[i]; - (*a)[k] = aij[i]; - ++(*colptr)[j]; + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /*assert(jcol 0; --j) (*colptr)[j] = (*colptr)[j-1]; - (*colptr)[0] = 0; - - if ( nnz_loc ) { - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + + if ( procs > 1 ) { + SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + if ( SendCnt ) { + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + } + SUPERLU_FREE(ptr_to_send); + if ( maxnnzToRecv ) { + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + } + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. 
+ ------------------------------------------------------------*/ + if ( nnz_loc ) { /* nnz_loc can be zero */ + if ( !(*rowind = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *rowind[]."); + if ( !(*a = doubleMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *a[]."); + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = (*colptr)[0]; + (*colptr)[0] = 0; + for (j = 1; j < n; ++j) { + k += jsize; + jsize = (*colptr)[j]; + (*colptr)[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + j = ja[i]; + k = (*colptr)[j]; + (*rowind)[k] = ia[i]; + (*a)[k] = aij[i]; + ++(*colptr)[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; + (*colptr)[0] = 0; + + if ( nnz_loc ) { + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + } #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dReDistribute_A()"); + CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif - - return 0; + + return 0; } /* dReDistribute_A */ - float +float pddistribute(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, - gridinfo_t *grid, int_t nrhs) - /* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ + ScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, + gridinfo_t *grid, int_t nrhs) +/* + * -- Distributed SuperLU routine (version 2.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * March 15, 2003 + * + * + * Purpose + * ======= + * Distribute the matrix onto the 2D process mesh. + * + * Arguments + * ========= + * + * fact (input) fact_t + * Specifies whether or not the L and U structures will be re-used. + * = SamePattern_SameRowPerm: L and U structures are input, and + * unchanged on exit. + * = DOFACT or SamePattern: L and U structures are computed and output. + * + * n (input) int + * Dimension of the matrix. + * + * A (input) SuperMatrix* + * The distributed input matrix A of dimension (A->nrow, A->ncol). + * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: + * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. 
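The triplet-to-CCS conversion just shown relies on a standard counting pattern: count entries per column, turn the counts into starting offsets, scatter while advancing each column's cursor, then shift the pointers back by one slot. A stand-alone restatement for reference, using plain int indices instead of int_t and a hypothetical function name:

    /* Illustration (not from the patch) of the pattern dReDistribute_A uses
       to turn (ia, ja, aij) triplets into compressed columns. */
    static void triplets_to_csc(int n, int nnz, const int *ia, const int *ja,
                                const double *aij, int *colptr, int *rowind,
                                double *a)
    {
        int i, j, k;
        for (j = 0; j <= n; ++j) colptr[j] = 0;
        for (i = 0; i < nnz; ++i) ++colptr[ja[i]];        /* column counts     */
        for (j = 0, k = 0; j < n; ++j) {                  /* exclusive prefix  */
            int len = colptr[j]; colptr[j] = k; k += len;
        }
        for (i = 0; i < nnz; ++i) {                       /* scatter, using    */
            j = ja[i]; k = colptr[j]++;                   /* colptr as cursor  */
            rowind[k] = ia[i]; a[k] = aij[i];
        }
        for (j = n; j > 0; --j) colptr[j] = colptr[j-1];  /* undo the advance  */
        colptr[0] = 0;
    }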
+ * + * ScalePermstruct (input) ScalePermstruct_t* + * The data structure to store the scaling and permutation vectors + * describing the transformations performed to the original matrix A. + * + * Glu_freeable (input) *Glu_freeable_t + * The global structure describing the graph of L and U. + * + * LUstruct (input) LUstruct_t* + * Data structures for L and U factors. + * + * grid (input) gridinfo_t* + * The 2D process mesh. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ { - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, - len, len1, nsupc; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; int_t lib; /* local block row number */ int_t nlb; /* local block rows*/ - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol,krow, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - double *a; - int_t *asub, *xa; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *usub1, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - int_t *index_srt; /* indices consist of headers and row subscripts */ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + double *a; + int_t *asub, *xa; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ int *index1; /* temporary pointer to array of int */ - double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ @@ -409,36 +411,35 @@ 
pddistribute(fact_t fact, int_t n, SuperMatrix *A, int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - - /*-- Counts to be used in factorization. --*/ - int *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ int_t *ActiveFlag; int_t *ActiveFlagAll; int_t Iactive; @@ -446,12 +447,13 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, int_t *idxs; int_t **nzrows; double rseed; - int rank_cnt,rank_cnt_ref,Root; + int rank_cnt,rank_cnt_ref,Root; double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float mem_use = 0.0; + double zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; int_t *frecv, *brecv, *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ @@ -462,1525 +464,1488 @@ pddistribute(fact_t fact, int_t n, SuperMatrix *A, int_t ik, il, lk, rel, knsupc, idx_r; int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; int_t nub; - int tag; + int tag; #if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; + int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; + double t, t_u, t_l; + int_t u_blks; #endif - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = (NRformat_loc *) A->Store; - - // #if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - dword = sizeof(double); - - aln_i = ceil(CACHELINE/(double)iword); + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; - // #endif +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(double); + aln_i = ceil(CACHELINE/(double)iword); +//#endif #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pddistribute()"); + CHECK_MALLOC(iam, "Enter pddistribute()"); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, - grid, &xa, &asub, &a); + dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf("--------\n" - ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); #endif - if ( fact == SamePattern_SameRowPerm ) { + if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; + t_l = t_u = 0; u_blks = 0; #endif - /* We can propagate the new values of A into the existing - L and U data structures. 
*/ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) - mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... 
*/ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ #if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); #endif - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ #if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; + t_l += SuperLU_timer_() - t; #endif - } /* if mycol == pc */ - } /* for jb ... */ + } /* if mycol == pc */ + } /* for jb ... */ - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 
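
[Reviewer note] The SamePattern_SameRowPerm path above scatters each column of A into a dense work column (the SPA) and then gathers it into column-major supernode storage whose leading dimension is the block's row count. A small self-contained sketch of that scatter/gather idea, with made-up sizes and values:

#include <stdio.h>

#define NROWS 6   /* rows of the local SPA column (toy value) */
#define NSUPC 2   /* columns in the supernode     (toy value) */

int main(void)
{
    double dense[NSUPC][NROWS] = {{0.0}};   /* SPA: one dense column per j     */
    double lusup[NROWS * NSUPC];            /* column-major block, LDA = NROWS */

    /* scatter a few nonzeros of A into the SPA */
    dense[0][1] = 3.0;  dense[0][4] = -1.0; /* entries of column 0 */
    dense[1][2] = 5.0;                      /* entry of column 1   */

    /* gather: walk the stored row indices once and copy every supernode
       column at stride NROWS, zeroing the SPA entry so it can be reused */
    int stored_rows[] = {1, 2, 4};
    int nstored = 3;
    for (int r = 0; r < nstored; ++r) {
        int irow = stored_rows[r];
        for (int j = 0; j < NSUPC; ++j) {
            lusup[r + j * NROWS] = dense[j][irow];
            dense[j][irow] = 0.0;
        }
    }

    for (int j = 0; j < NSUPC; ++j)
        for (int r = 0; r < nstored; ++r)
            printf("lusup(%d,%d) = %g\n", r, j, lusup[r + j * NROWS]);
    return 0;
}
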
2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); #endif - } else { - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ + } else { + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; + t_l = t_u = 0; u_blks = 0; #endif - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) - ABORT("Malloc fails for ToRecv[]."); - for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) - ABORT("Malloc fails for index[]."); + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) - mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif - for (i = 0; i < j; ++i) index1[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) - ABORT("Malloc fails for ToSendD[]."); - for (i = 0; i < k; ++i) ToSendD[i] = NO; - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. 
*/ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) - mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; #endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. 
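
[Reviewer note] The counting pass here relies on usub[] storing only the first nonzero row (fstnz) of each U segment: within the destination supernode block the segment is treated as dense from that row to the end of the block, so its length is FstBlockC(gb+1) - irow. A tiny illustration, with the supernode boundaries invented for the example:

#include <stdio.h>

int main(void)
{
    /* toy supernode boundaries: block g spans rows [xsup[g], xsup[g+1]) */
    int xsup[] = {0, 4, 9, 12};
    int gb   = 1;      /* destination block of the segment         */
    int irow = 6;      /* first nonzero row of the segment (fstnz)  */

    int seg_len = xsup[gb + 1] - irow;   /* rows 6,7,8 -> 3 stored entries */
    printf("segment in block %d starting at row %d stores %d values\n",
           gb, irow, seg_len);
    return 0;
}
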
*/ + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. */ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ #if ( PRNTlevel>=1 ) - ++nUblocks; + ++nUblocks; #endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. 
*/ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) - mem_use -= 2.0*k * iword; + mem_use -= 2.0*k * iword; #endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); - - /* ------------------------------------------------ */ + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) - mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. 
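
[Reviewer note] For orientation, the U block-row metadata assembled above appears to be a single int array (reconstructed from the assignments to index[0..2] and the end marker, with BR_HEADER = 3 and UB_DESCRIPTOR = 2 assumed): a 3-entry header {#column blocks, nzval length, index length}, then per column block a 2-entry descriptor {global block number, #stored values} followed by one fstnz row per column of that block, and a trailing -1. A walker over a hand-built toy array of that shape:

#include <stdio.h>

#define BR_HEADER     3
#define UB_DESCRIPTOR 2

int main(void)
{
    /* one block row with two column blocks: block 1 (2 columns), block 3 (3 columns);
       toy data laid out by hand following the layout described above */
    int index[] = { 2, 7, 12,          /* header: #blocks, nzval len, index len */
                    1, 3,  5, 6,       /* block 1: descriptor + 2 fstnz rows    */
                    3, 4,  9, 10, 9,   /* block 3: descriptor + 3 fstnz rows    */
                    -1 };              /* end marker                            */
    int supsize[] = {0, 2, 0, 3};      /* SuperSize() of the referenced blocks  */

    int p = BR_HEADER;
    for (int b = 0; b < index[0]; ++b) {
        int jb  = index[p];
        int nnz = index[p + 1];
        printf("column block %d: %d stored values, fstnz rows:", jb, nnz);
        for (int c = 0; c < supsize[jb]; ++c)
            printf(" %d", index[p + UB_DESCRIPTOR + c]);
        printf("\n");
        p += UB_DESCRIPTOR + supsize[jb];
    }
    return 0;
}
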
*/ - if ( !(Lnzval_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - if ( !(Lindval_loc_bc_ptr = - (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); - Lindval_loc_bc_ptr[k-1] = NULL; - - if ( !(Linv_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); - } - if ( !(Uinv_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); - } - Linv_bc_ptr[k-1] = NULL; - Uinv_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + + + /* These lists of processes will be used for triangular solves. 
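
[Reviewer note] fsendx_plist/bsendx_plist (and ToSendR earlier) use a common allocation trick: one contiguous array of k*nprow entries plus a separate pointer array whose i-th entry points at slice i*nprow, so list[i][p] is addressable without k separate mallocs and can be released with a single free. A minimal sketch of the same pattern:

#include <stdio.h>
#include <stdlib.h>

#define EMPTY (-1)

int main(void)
{
    int k = 4, nprow = 3;                        /* toy sizes */
    int *flat   = malloc((size_t)k * nprow * sizeof(int));
    int **plist = malloc((size_t)k * sizeof(int *));
    if (!flat || !plist) return 1;

    for (int i = 0; i < k * nprow; ++i) flat[i] = EMPTY;
    for (int i = 0; i < k; ++i) plist[i] = &flat[i * nprow];

    plist[2][1] = 1;                             /* mark one (block, process-row) pair */
    printf("plist[2][1] = %d, plist[0][0] = %d\n", plist[2][1], plist[0][0]);

    free(plist);
    free(flat);                                  /* one free releases every row */
    return 0;
}
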
*/ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) - mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } /* for j ... */ + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... */ - jbrow = PROW( jb, grid ); + jbrow = PROW( jb, grid ); - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. 
owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - // if(ljb==0){ - // printf("no here??\n"); - // fflush(stdout); - // } - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ #if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); #endif - /*------------------------------------------------ - * SET UP L BLOCKS. 
- *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - // if(gb==747)printf("worita %5d%5d",iam,jb); - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } #if ( PRNTlevel>=1 ) - ++nLblocks; + ++nLblocks; #endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. 
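
[Reviewer note] Both the U and the L passes use the rb_marker idiom: instead of clearing a "seen" array for every supernode column, the marker stores jb+1 when block lb is first touched while processing column jb, so the test rb_marker[lb] <= jb fires exactly once per column and the array never needs resetting. A tiny standalone demonstration with an invented visit pattern:

#include <stdio.h>
#include <string.h>

#define NLB 3   /* local blocks (toy value) */

int main(void)
{
    int marker[NLB];
    memset(marker, 0, sizeof marker);

    /* blocks touched while processing each column, repeats included,
       as happens when scanning nonzeros */
    int visits[2][4] = { {1, 1, 2, 1}, {0, 2, 2, 1} };

    for (int jb = 0; jb < 2; ++jb)
        for (int v = 0; v < 4; ++v) {
            int lb = visits[jb][v];
            if (marker[lb] <= jb) {          /* first touch in this column */
                marker[lb] = jb + 1;
                printf("column %d: first time seeing block %d\n", jb, lb);
            }
        }
    return 0;
}
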
*/ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - if (!(lusup = - doubleMalloc_dist(len*nsupc))) { - fprintf(stderr, "col block " IFMT " ", jb); - ABORT("Malloc fails for lusup[]"); - } - // if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) - if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) - ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); - - - - - if (!(Linv_bc_ptr[ljb] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[*][] col block " IFMT, jb); - } - if (!(Uinv_bc_ptr[ljb] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[*][] col block " IFMT, jb); - } - - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - - - Lindval_loc_bc_ptr[ljb][k] = lb; - Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; - Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; - - // if(ljb==0){ - // printf("lb %5d, ind %5d, val %5d\n",lb,next_lind,next_lval); - // fflush(stdout); - // } - - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - - - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } - } /* for i ... */ - - Lrowind_bc_ptr[ljb] = index; - Lnzval_bc_ptr[ljb] = lusup; - - - /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ - if(nrbl>1){ - krow = PROW( jb, grid ); - if(myrow==krow){ /* skip the diagonal block */ - uu=nrbl-2; - lloc = &Lindval_loc_bc_ptr[ljb][1]; - }else{ - uu=nrbl-1; - lloc = Lindval_loc_bc_ptr[ljb]; - } - quickSortM(lloc,0,uu,nrbl,0,3); - } - - - if ( !(index_srt = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index_srt[]"); - if (!(lusup_srt = doubleMalloc_dist(len*nsupc))) - ABORT("Malloc fails for lusup_srt[]"); - - idx_indx = BC_HEADER; - idx_lusup = 0; - for (jj=0;jj1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } - for(i=0;i=1 ) - t_l += SuperLU_timer_() - t; + t_l += SuperLU_timer_() - t; #endif - } /* if mycol == pc */ - - } /* for jb ... */ + } /* if mycol == pc */ - // for (j=0;j<19*3;j++){ - // printf("Lindval %5d\n",Lindval_loc_bc_ptr[0][j]); - // fflush(stdout); - // } + } /* for jb ... 
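
[Reviewer note] The quickSortM call above appears to reorder the (local block, index offset, value offset) triples stored with stride nrbl in Lindval_loc_bc_ptr[ljb] so that the off-diagonal row blocks of a supernode column end up sorted by block number while the diagonal block stays in slot 0. A hedged standalone equivalent using qsort on an array of structs; the layout and the pin-the-diagonal rule are inferred from the surrounding code, not taken from quickSortM itself:

#include <stdio.h>
#include <stdlib.h>

typedef struct { int lb, indptr, valptr; } BlockRef;   /* one L row block of a column */

static int by_block(const void *a, const void *b)
{
    return ((const BlockRef *)a)->lb - ((const BlockRef *)b)->lb;
}

int main(void)
{
    /* toy column: slot 0 is the diagonal block, the rest arrive unsorted */
    BlockRef refs[] = { {4, 2, 0}, {9, 10, 12}, {6, 4, 3}, {5, 7, 8} };
    int nrbl = 4;

    /* keep the diagonal block first, sort only the off-diagonal tail */
    qsort(refs + 1, (size_t)(nrbl - 1), sizeof(BlockRef), by_block);

    for (int i = 0; i < nrbl; ++i)
        printf("slot %d: block %d (indptr %d, valptr %d)\n",
               i, refs[i].lb, refs[i].indptr, refs[i].valptr);
    return 0;
}
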
*/ - - ///////////////////////////////////////////////////////////////// - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub1 = Ufstnz_br_ptr[lk]; - if ( usub1 ) { /* Not an empty block row. */ - /* usub1[0] -- number of column blocks in this block row. */ - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ - k = usub1[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); } + } + } - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. 
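
[Reviewer note] Because U is stored by block rows, the backward solve needs a column-wise view; Urbs[], Ucb_indptr[] and Ucb_valptr[] provide it with the usual two-pass pattern: one pass over the row-wise structure to count entries per block column, allocate each column's list, then a second pass to record where each row block's data lives. A compact sketch of that count-then-fill idiom on a toy row-wise structure:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* toy row-wise structure: row r touches the listed block columns */
    int nrows = 3, ncols = 4;
    int row_ptr[]  = {0, 2, 5, 6};
    int row_cols[] = {0, 2,  1, 2, 3,  2};

    /* pass 1: count how many row blocks appear in each block column */
    int *cnt = calloc((size_t)ncols, sizeof(int));
    for (int r = 0; r < nrows; ++r)
        for (int p = row_ptr[r]; p < row_ptr[r + 1]; ++p)
            ++cnt[row_cols[p]];

    /* allocate one list per nonempty column, then pass 2: fill them */
    int **col_rows = malloc((size_t)ncols * sizeof(int *));
    int *fill = calloc((size_t)ncols, sizeof(int));
    for (int c = 0; c < ncols; ++c)
        col_rows[c] = cnt[c] ? malloc((size_t)cnt[c] * sizeof(int)) : NULL;
    for (int r = 0; r < nrows; ++r)
        for (int p = row_ptr[r]; p < row_ptr[r + 1]; ++p) {
            int c = row_cols[p];
            col_rows[c][fill[c]++] = r;
        }

    for (int c = 0; c < ncols; ++c) {
        printf("column %d:", c);
        for (int i = 0; i < cnt[c]; ++i) printf(" row %d", col_rows[c][i]);
        printf("\n");
    }
    for (int c = 0; c < ncols; ++c) free(col_rows[c]);
    free(col_rows); free(fill); free(cnt);
    return 0;
}
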
*/ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub1 = Ufstnz_br_ptr[lk]; - if ( usub1 ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - - for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ - k = usub1[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - - ++Urbs1[ljb]; - j += usub1[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - ///////////////////////////////////////////////////////////////// - - // if(LSUM=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Bcast tree for L ... */ - - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - ABORT("Malloc fails for LBtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); - if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_BC[]."); + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); - MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm); + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + - if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) ) - ABORT("Calloc fails for ActiveFlag[]."); - for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; - for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); - } /* for j ... 
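
[Reviewer note] SeedSTD_BC[] is filled with rand() locally and then max-reduced over the column communicator, so every process in a column ends up holding the same per-tree seed without any designated sender. A minimal sketch of that "agree on a random value via Allreduce(MAX)" idiom, using MPI_COMM_WORLD in place of the column subcommunicator:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int iam;
    MPI_Comm_rank(MPI_COMM_WORLD, &iam);

    srand(100 + iam);                 /* a different stream on every rank */
    double seed = (double)rand();

    /* after the reduction every rank holds the same (maximum) value */
    MPI_Allreduce(MPI_IN_PLACE, &seed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    printf("rank %d agreed on seed %.0f\n", iam, seed);

    MPI_Finalize();
    return 0;
}
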
*/ - } - } - - for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ - - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow-1,grid->nprow,0,2); - for (j=0;jnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; - for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; - for (j=0;jnprow;++j)ranks[j]=-1; - - Root=-1; - Iactive = 0; - for (j=0;jnprow;++j){ - if(ActiveFlag[j]!=3*nsupers){ - gb = ActiveFlag[j]; - pr = PROW( gb, grid ); - if(gb==jb)Root=pr; - if(myrow==pr)Iactive=1; - } + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; } - + } - quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + if(rank_cnt>1){ - if(Iactive==1){ - // printf("jb %5d damn\n",jb); - // fflush(stdout); - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->nprow; ++j){ - if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; - ++rank_cnt; - } - } - - if(rank_cnt>1){ - - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); - BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d'); - - // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); - - // if(iam==15 || iam==3){ - // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'d')); - // fflush(stdout); - // } - - // #if ( PRNTlevel>=1 ) - if(Root==myrow){ - rank_cnt_ref=1; - for (j = 0; j < grid->nprow; ++j) { - if ( fsendx_plist[ljb][j] != EMPTY ) { - ++rank_cnt_ref; - } - } - assert(rank_cnt==rank_cnt_ref); - - // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); - - // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;jcomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'d')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; } - // #endif - } - } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); #endif - + #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for L ... 
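
[Reviewer note] BcTree_Create above builds an asynchronous broadcast tree over only the process rows that actually own blocks in a given column (the ranks[] list, root first). As a conceptual stand-in, the same rank-selection step can be expressed with a plain MPI subcommunicator and MPI_Bcast; this is a hedged sketch of the communication pattern, not SuperLU_DIST's tree implementation:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int iam, np;
    MPI_Comm_rank(MPI_COMM_WORLD, &iam);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    /* toy "participating ranks, root first" list, like ranks[] above */
    int ranks[3] = {0, 1, 2};
    int rank_cnt = np < 3 ? np : 3;

    MPI_Group world, sub;
    MPI_Comm subcomm;
    MPI_Comm_group(MPI_COMM_WORLD, &world);
    MPI_Group_incl(world, rank_cnt, ranks, &sub);
    MPI_Comm_create(MPI_COMM_WORLD, sub, &subcomm);

    if (subcomm != MPI_COMM_NULL) {      /* only the listed ranks take part */
        double xk = (iam == ranks[0]) ? 42.0 : 0.0;
        MPI_Bcast(&xk, 1, MPI_DOUBLE, 0, subcomm);   /* root is ranks[0] */
        printf("rank %d received %g\n", iam, xk);
        MPI_Comm_free(&subcomm);
    }
    MPI_Group_free(&sub);
    MPI_Group_free(&world);
    MPI_Finalize();
    return 0;
}
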
*/ - /* the following is used as reference */ - nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - pr = PROW( k, grid ); - if ( myrow == pr ) { - lib = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if (mycol == kcol || fmod[lib] ) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - ABORT("Malloc fails for LRtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); - // if ( !(idxs = intCalloc_dist(nsupers)) ) - // ABORT("Calloc fails for idxs[]."); + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); - // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - // ABORT("Malloc fails for nzrows[]."); + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); - if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_RD[]."); + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); - for (i=0;irscp.comm); + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); - // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
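
[Reviewer note] The mod_bit/frecv step above is a small but important pattern: each process sets a 0/1 flag per local block row meaning "I will contribute an update to this row", and an MPI_Allreduce with MPI_SUM along the process row turns those flags into the number of partial sums the diagonal process must wait for during the L-solve. A standalone sketch of just that counting step, with MPI_COMM_WORLD standing in for the row communicator and a made-up contribution pattern:

#include <mpi.h>
#include <stdio.h>

#define NLB 4   /* local block rows (toy value) */

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int iam;
    MPI_Comm_rank(MPI_COMM_WORLD, &iam);

    /* 1 if this process owns blocks that modify row k, else 0 */
    int mod_bit[NLB], frecv[NLB];
    for (int k = 0; k < NLB; ++k)
        mod_bit[k] = (iam + k) % 2;

    /* every process learns the count, but only the row owner really uses it */
    MPI_Allreduce(mod_bit, frecv, NLB, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    if (iam == 0)
        for (int k = 0; k < NLB; ++k)
            printf("block row %d expects %d contributions\n", k, frecv[k]);

    MPI_Finalize();
    return 0;
}
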
*/ - // fsupc = FstBlockC( jb ); - // len=xlsub[fsupc+1]-xlsub[fsupc]; - // idxs[jb] = len-1; - // if(len>0){ - // if ( !(nzrows[jb] = intMalloc_dist(len)) ) - // ABORT("Malloc fails for nzrows[jb]"); - // for(i=xlsub[fsupc];i0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); } + } + } - - if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; - - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - fsupc = FstBlockC( jb ); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; pc = PCOL( jb, grid ); - for(i=xlsub[fsupc];inpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); - } - } + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } } - - - for (lib=0;libnprow; /* not sure */ - if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; - for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; - for (j=0;jnpcol;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - - for (j=0;jnpcol;++j){ - if(ActiveFlag[j]!=-3*nsupers){ - jb = ActiveFlag[j]; - pc = PCOL( jb, grid ); - if(jb==ib)Root=pc; - if(mycol==pc)Iactive=1; - } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; } - - - quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); - - if(Iactive==1){ - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->npcol; ++j){ - if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; - ++rank_cnt; - } - } - if(rank_cnt>1){ + } + if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); - RdTree_SetTag(LRtree_ptr[lib], RD_L,'d'); - // } + LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'d'); + // } - // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); - // if(ib==15 || ib ==16){ + // if(ib==15 || ib ==16){ - // if(iam==15 || iam==3){ - // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'d')); - // fflush(stdout); - // } + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'d')); + // fflush(stdout); + // } - // #if ( PRNTlevel>=1 ) - // if(Root==mycol){ - // assert(rank_cnt==frecv[lib]); - // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // // for(j=0;j=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); 
+ // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Bcast tree for U ... */ + /* construct the Bcast tree for U ... */ - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - ABORT("Malloc fails for UBtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); - if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_BC[]."); + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); - for (i=0;icscp.comm); + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm); - for (ljb = 0; ljb nprow*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } - for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); - // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); - // fflush(stdout); - //if(gb==jb)Root=pr; - } - - + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; } - pr = PROW( jb, grid ); // take care of diagonal node stored as L - // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); // fflush(stdout); - ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); - } - } - - - for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; - for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; - for (j=0;jnprow;++j)ranks[j]=-1; - - Root=-1; - Iactive = 0; - for (j=0;jnprow;++j){ - if(ActiveFlag[j]!=-3*nsupers){ - gb = ActiveFlag[j]; - pr = PROW( gb, grid ); - if(gb==jb)Root=pr; - if(myrow==pr)Iactive=1; - } - } - - quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); - // printf("jb: %5d Iactive %5d\n",jb,Iactive); - // fflush(stdout); - if(Iactive==1){ - // printf("root:%5d jb: %5d\n",Root,jb); + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); // fflush(stdout); - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->nprow; ++j){ - if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; - ++rank_cnt; - } - } - // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); - // fflush(stdout); - if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); - BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d'); - - // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); - - if(Root==myrow){ - rank_cnt_ref=1; - for (j = 0; j < grid->nprow; ++j) { - // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); - // fflush(stdout); - if ( bsendx_plist[ljb][j] != EMPTY ) { - ++rank_cnt_ref; - } - } - // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); - // fflush(stdout); - assert(rank_cnt==rank_cnt_ref); - } + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; } } - } - } - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. 
Construct Bcast tree for U: %.2f\t\n", t); +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for U ... */ - /* the following is used as reference */ - nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for mod_bit[]."); - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - pr = PROW( k, grid ); - if ( myrow == pr ) { - lib = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if (mycol == kcol || bmod[lib] ) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - ABORT("Malloc fails for URtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); - // if ( !(idxs = intCalloc_dist(nsupers)) ) - // ABORT("Calloc fails for idxs[]."); + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); - // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - // ABORT("Malloc fails for nzrows[]."); + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); - if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_RD[]."); + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); - for (i=0;irscp.comm); - - - // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - // fsupc = FstBlockC( jb ); - // len=0; - // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - // istart = xusub[j]; - // /* NOTE: Only the first nonzero index of the segment - // is stored in usub[]. 
*/ - // len += xusub[j+1] - xusub[j]; - // } - - // idxs[jb] = len-1; - - // if(len>0){ - // if ( !(nzrows[jb] = intMalloc_dist(len)) ) - // ABORT("Malloc fails for nzrows[jb]"); - - // fsupc = FstBlockC( jb ); - - // len=0; - - // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - // istart = xusub[j]; - // /* NOTE: Only the first nonzero index of the segment - // is stored in usub[]. */ - // for (i = istart; i < xusub[j+1]; ++i) { - // irow = usub[i]; /* First nonzero in the segment. */ - // nzrows[jb][len]=irow; - // len++; - // } - // } - // quickSort(nzrows[jb],0,len-1,0); - // } - // else{ - // nzrows[jb] = NULL; - // } - // } - + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); - for (lib = 0; lib npcol*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; - - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - fsupc = FstBlockC( jb ); - pc = PCOL( jb, grid ); + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // len += xusub[j+1] - xusub[j]; + // } - fsupc = FstBlockC( jb ); - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - ib = BlockNum( irow ); - pr = PROW( ib, grid ); - if ( myrow == pr ) { /* Block row ib in my process row */ - lib = LBi( ib, grid ); /* Local block number */ - ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); - } - } - } + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; - pr = PROW( jb, grid ); + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
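(each such row irow belongs to some block row ib; when ib lies in this process row, the smallest block column jb that updates it is recorded in ActiveFlagAll, which later drives the construction of the reduce tree URtree_ptr for U)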
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); if ( myrow == pr ) { /* Block row ib in my process row */ - lib = LBi( jb, grid ); /* Local block number */ + lib = LBi( ib, grid ); /* Local block number */ ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; } } - - - for (lib=0;libnprow; /* not sure */ - if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; - for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; - for (j=0;jnpcol;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - - for (j=0;jnpcol;++j){ - if(ActiveFlag[j]!=3*nsupers){ - jb = ActiveFlag[j]; - pc = PCOL( jb, grid ); - if(jb==ib)Root=pc; - if(mycol==pc)Iactive=1; - } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; } - - quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); - - if(Iactive==1){ - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->npcol; ++j){ - if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; - ++rank_cnt; - } - } - if(rank_cnt>1){ + } + if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); - RdTree_SetTag(URtree_ptr[lib], RD_U,'d'); - // } - - // #if ( PRNTlevel>=1 ) - if(Root==mycol){ - // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); - // fflush(stdout); - assert(rank_cnt==brecv[lib]); - // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;jcomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'d'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); #endif - - //////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - - Llu->LRtree_ptr = LRtree_ptr; - Llu->LBtree_ptr = LBtree_ptr; - Llu->URtree_ptr = URtree_ptr; - Llu->UBtree_ptr = UBtree_ptr; - Llu->Linv_bc_ptr = Linv_bc_ptr; - Llu->Uinv_bc_ptr = Uinv_bc_ptr; - Llu->Urbs = Urbs; - Llu->Ucb_indptr = Ucb_indptr; - Llu->Ucb_valptr = Ucb_valptr; #if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", - nLblocks, nUblocks); + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(Llu->mod_bit = intMalloc_dist(k)) ) - ABORT("Malloc fails for mod_bit[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); #if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); + if ( !iam ) printf(".. 
1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif - } /* else fact != SamePattern_SameRowPerm */ + } /* else fact != SamePattern_SameRowPerm */ - if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ - SUPERLU_FREE(asub); - SUPERLU_FREE(a); - } - SUPERLU_FREE(xa); + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ + SUPERLU_FREE(asub); + SUPERLU_FREE(a); + } + SUPERLU_FREE(xa); #if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pddistribute()"); + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit pddistribute()"); #endif - - return (mem_use); - } /* PDDISTRIBUTE */ + + return (mem_use); +} /* PDDISTRIBUTE */ diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index 7d427829..3d076cc1 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -552,7 +552,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, Pslu_freeable_t Pslu_freeable; float flinfo; int blas_flag; - + /* Initialization. */ m = A->nrow; n = A->ncol; @@ -925,8 +925,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, #endif } - - /* ------------------------------------------------------------ Perform the LU factorization: symbolic factorization, redistribution, and numerical factorization. @@ -978,14 +976,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( permc_spec != MY_PERMC && Fact == DOFACT ) { /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */ if ( permc_spec == PARMETIS ) { - - // #pragma omp parallel // { // #pragma omp master // { - - /* Get column permutation vector in perm_c. * * This routine takes as input the distributed input matrix A * * and does not modify it. It also allocates memory for * @@ -995,8 +989,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); // } - // } - + // } if (flinfo > 0) { #if ( PRNTlevel>=1 ) fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n"); @@ -1147,21 +1140,15 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); - - - - // #pragma omp parallel // { // #pragma omp master - // { - + // { pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; + stat->utime[FACT] = SuperLU_timer_() - t; + // } // } - // } - #if 0 // #ifdef GPU_PROF @@ -1354,8 +1341,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, fst_row, ldb, nrhs, SOLVEstruct, stat, info); // } // } - - + /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c index 9709ce4f..3ac15454 100644 --- a/SRC/pdgstrs.c +++ b/SRC/pdgstrs.c @@ -1,13 +1,13 @@ /*! \file - Copyright (c) 2003, The Regents of the University of California, through - Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from U.S. Dept. of Energy) +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) - All rights reserved. +All rights reserved. 
- The source code is distributed under BSD license, see the file License.txt - at the top-level directory. - */ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ /*! @file @@ -22,7 +22,6 @@ */ #include #include "superlu_ddefs.h" - #ifndef CACHELINE #define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ #endif @@ -83,7 +82,7 @@ * | | | | | | * --------- <---------------| */ - + /*#define ISEND_IRECV*/ /* @@ -91,7 +90,7 @@ */ #ifdef _CRAY fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); + double*, int*, double*, int*); _fcd ftcs1; _fcd ftcs2; _fcd ftcs3; @@ -150,62 +149,60 @@ _fcd ftcs3; * */ - int_t +int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, - int_t fst_row, int_t *ilsum, double *x, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, - gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) + int_t fst_row, int_t *ilsum, double *x, + ScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, + gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *perm_r, *perm_c; /* row and column permutation vectors */ - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *xsup, *supno; - int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; - int p, procs; - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *perm_r, *perm_c; /* row and column permutation vectors */ + int_t *send_ibuf, *recv_ibuf; + double *send_dbuf, *recv_dbuf; + int_t *xsup, *supno; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; + int p, procs; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; MPI_Request req_i, req_d, *req_send, *req_recv; MPI_Status status, *status_send, *status_recv; int Nreq_recv, Nreq_send, pp; - #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()"); + CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()"); #endif - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - SendCnt = gstrs_comm->B_to_X_SendCnt; - SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; - RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; - sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; - rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - /* ------------------------------------------------------------ - NOW COMMUNICATE THE ACTUAL DATA. 
- ------------------------------------------------------------*/ - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + SendCnt = gstrs_comm->B_to_X_SendCnt; + SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; + RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; + sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; + rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + /* ------------------------------------------------------------ + NOW COMMUNICATE THE ACTUAL DATA. + ------------------------------------------------------------*/ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) ABORT("Malloc fails for req_send[]."); if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) @@ -214,38 +211,35 @@ pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, ABORT("Malloc fails for status_send[]."); if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) ABORT("Malloc fails for status_recv[]."); - - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls[p] * nrhs; + + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls[p] * nrhs; + } + + /* Copy the row indices and values to the send buffer. */ + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + k = ptr_to_ibuf[p]; + send_ibuf[k] = irow; + k = ptr_to_dbuf[p]; + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + send_dbuf[k++] = B[i + j*ldb]; } - - /* Copy the row indices and values to the send buffer. */ - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - k = ptr_to_ibuf[p]; - send_ibuf[k] = irow; - k = ptr_to_dbuf[p]; - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - send_dbuf[k++] = B[i + j*ldb]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - - -#if 1 - /* Communicate the (permuted) row indices. 
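(this pair of all-to-all exchanges ships first the permuted row indices and then the matching right-hand-side values, so that each entry of B ends up on the diagonal process that owns its block row)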
*/ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - - /* Communicate the numerical values. */ - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; + } +#if 1 + /* Communicate the (permuted) row indices. */ + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + + /* Communicate the numerical values. */ + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); #else /* Communicate the (permuted) row indices. */ @@ -259,91 +253,41 @@ pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, MPI_Wait(&req_i,&status); MPI_Wait(&req_d,&status); -#endif - - - - // MPI_Barrier( grid->comm ); - - - // Nreq_send=0; - // for (pp=0;pp0){ - // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; - // } - // } - - // Nreq_recv=0; - // for (pp=0;pp0){ - // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; - // } - // } - - // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); - // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); - - - // Nreq_send=0; - // for (pp=0;pp0){ - // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; - // } - // } - // Nreq_recv=0; - // for (pp=0;pp0){ - // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; - // } - // } - - // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); - // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); - - - - /* ------------------------------------------------------------ - Copy buffer into X on the diagonal processes. - ------------------------------------------------------------*/ - ii = 0; - for (p = 0; p < procs; ++p) { - jj = rdispls_nrhs[p]; - for (i = 0; i < RecvCnt[p]; ++i) { - /* Only the diagonal processes do this; the off-diagonal processes - have 0 RecvCnt. */ - irow = recv_ibuf[ii]; /* The permuted row index. */ - k = BlockNum( irow ); - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number. */ - l = X_BLK( lk ); - x[l - XK_H] = k; /* Block number prepended in the header. */ - irow = irow - FstBlockC(k); /* Relative row number in X-block */ - RHS_ITERATE(j) { - x[l + irow + j*knsupc] = recv_dbuf[jj++]; - } - ++ii; - } +#endif + /* ------------------------------------------------------------ + Copy buffer into X on the diagonal processes. + ------------------------------------------------------------*/ + ii = 0; + for (p = 0; p < procs; ++p) { + jj = rdispls_nrhs[p]; + for (i = 0; i < RecvCnt[p]; ++i) { + /* Only the diagonal processes do this; the off-diagonal processes + have 0 RecvCnt. */ + irow = recv_ibuf[ii]; /* The permuted row index. */ + k = BlockNum( irow ); + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number. */ + l = X_BLK( lk ); + x[l - XK_H] = k; /* Block number prepended in the header. 
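(the rows that follow are stored relative to the first row of supernode k, one right-hand side after another)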
*/ + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = recv_dbuf[jj++]; + } + ++ii; } + } - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); SUPERLU_FREE(req_send); SUPERLU_FREE(req_recv); SUPERLU_FREE(status_send); SUPERLU_FREE(status_recv); - + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()"); + CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()"); #endif - return 0; + return 0; } /* pdReDistribute_B_to_X */ /*! \brief @@ -361,59 +305,59 @@ pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, * */ - int_t +int_t pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row, - int_t nrhs, double *x, int_t *ilsum, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + int_t nrhs, double *x, int_t *ilsum, + ScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + SOLVEstruct_t *SOLVEstruct) { - int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; - int_t *xsup, *supno; - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - int iam, p, q, pkk, procs; - int_t num_diag_procs, *diag_procs; + int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; + int_t *xsup, *supno; + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + double *send_dbuf, *recv_dbuf; + int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + int iam, p, q, pkk, procs; + int_t num_diag_procs, *diag_procs; MPI_Request req_i, req_d, *req_send, *req_recv; MPI_Status status, *status_send, *status_recv; int Nreq_recv, Nreq_send, pp; - + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()"); + CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()"); #endif - /* ------------------------------------------------------------ - INITIALIZATION. 
- ------------------------------------------------------------*/ - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; - iam = grid->iam; - procs = grid->nprow * grid->npcol; - - SendCnt = gstrs_comm->X_to_B_SendCnt; - SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; - RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; - sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; - rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + iam = grid->iam; + procs = grid->nprow * grid->npcol; + + SendCnt = gstrs_comm->X_to_B_SendCnt; + SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; + RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; + sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; + rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) ABORT("Malloc fails for req_send[]."); if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) @@ -421,58 +365,53 @@ pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row, if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) ABORT("Malloc fails for status_send[]."); if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) - ABORT("Malloc fails for status_recv[]."); - - + ABORT("Malloc fails for status_recv[]."); recv_dbuf = send_dbuf + k * nrhs; - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls_nrhs[p]; - } - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. 
*/ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number */ - irow = FstBlockC( k ); - l = X_BLK( lk ); - for (i = 0; i < knsupc; ++i) { + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls_nrhs[p]; + } + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number */ + irow = FstBlockC( k ); + l = X_BLK( lk ); + for (i = 0; i < knsupc; ++i) { #if 0 - ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ + ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ #else - ii = irow; + ii = irow; #endif - q = row_to_proc[ii]; - jj = ptr_to_ibuf[q]; - send_ibuf[jj] = ii; - jj = ptr_to_dbuf[q]; - RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ - send_dbuf[jj++] = x[l + i + j*knsupc]; - } - ++ptr_to_ibuf[q]; - ptr_to_dbuf[q] += nrhs; - ++irow; - } - } + q = row_to_proc[ii]; + jj = ptr_to_ibuf[q]; + send_ibuf[jj] = ii; + jj = ptr_to_dbuf[q]; + RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ + send_dbuf[jj++] = x[l + i + j*knsupc]; + } + ++ptr_to_ibuf[q]; + ptr_to_dbuf[q] += nrhs; + ++irow; } + } } - - /* ------------------------------------------------------------ - COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. - ------------------------------------------------------------*/ - + } + + /* ------------------------------------------------------------ + COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. + ------------------------------------------------------------*/ #if 1 - - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); #else MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i); @@ -483,82 +422,33 @@ pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row, MPI_Wait(&req_i,&status); MPI_Wait(&req_d,&status); #endif - - // MPI_Barrier( grid->comm ); - // Nreq_send=0; - // for (pp=0;pp0){ - // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; - // } - // } - - // Nreq_recv=0; - // for (pp=0;pp0){ - // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; - // } - // } - - // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); - // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); - // // MPI_Barrier( grid->comm ); - - // Nreq_send=0; - // for (pp=0;pp0){ - // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; - // } - // } - // Nreq_recv=0; - // for (pp=0;pp0){ - // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_recv[Nreq_recv] ); - // 
Nreq_recv++; - // } - // } - - // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); - // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); - // // MPI_Barrier( grid->comm ); - - - - - - /* ------------------------------------------------------------ - COPY THE BUFFER INTO B. - ------------------------------------------------------------*/ - for (i = 0, k = 0; i < m_loc; ++i) { - irow = recv_ibuf[i]; - irow -= fst_row; /* Relative row number */ - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - B[irow + j*ldb] = recv_dbuf[k++]; - } + /* ------------------------------------------------------------ + COPY THE BUFFER INTO B. + ------------------------------------------------------------*/ + for (i = 0, k = 0; i < m_loc; ++i) { + irow = recv_ibuf[i]; + irow -= fst_row; /* Relative row number */ + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + B[irow + j*ldb] = recv_dbuf[k++]; } + } - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); SUPERLU_FREE(req_send); SUPERLU_FREE(req_recv); SUPERLU_FREE(status_send); - SUPERLU_FREE(status_recv); - + SUPERLU_FREE(status_recv); #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()"); + CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()"); #endif - return 0; + return 0; } /* pdReDistribute_X_to_B */ + void pdCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t *stat, int *info) { @@ -586,12 +476,13 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t int INFO; double t; + double one = 1.0; + double zero = 0.0; + #if ( PROFlevel>=1 ) t = SuperLU_timer_(); #endif - // printf("wocao \n"); - // fflush(stdout); if(grid->iam==0){ printf("computing inverse of diagonal blocks...\n"); fflush(stdout); @@ -636,16 +527,13 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t knsupc = SuperSize( k ); for (j=0 ; j @@ -746,23 +633,23 @@ pdCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t * */ - void +void pdgstrs(int_t n, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, - gridinfo_t *grid, double *B, - int_t m_loc, int_t fst_row, int_t ldb, int nrhs, - SOLVEstruct_t *SOLVEstruct, - SuperLUStat_t *stat, int *info) + ScalePermstruct_t *ScalePermstruct, + gridinfo_t *grid, double *B, + int_t m_loc, int_t fst_row, int_t ldb, int nrhs, + SOLVEstruct_t *SOLVEstruct, + SuperLUStat_t *stat, int *info) { - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double beta = 0.0; - double zero = 0.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *x; /* X component at step k. */ - /* NOTE: x and lsum are of same size. */ - double *lusup, *dest; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + double alpha = 1.0; + double beta = 0.0; + double zero = 0.0; + double *lsum; /* Local running sum of the updates to B-components */ + double *x; /* X component at step k. */ + /* NOTE: x and lsum are of same size. */ + double *lusup, *dest; double *recvbuf,*recvbuf_on, *tempv, *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin; double *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. 
*/ double *Linv; /* Inverse of diagonal block */ @@ -873,8 +760,6 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, aln_d = ceil(CACHELINE/(double)dword); aln_i = ceil(CACHELINE/(double)iword); - - int num_thread = 1; #ifdef _OPENMP #pragma omp parallel default(shared) @@ -893,84 +778,82 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, TIC(t1_sol); t = SuperLU_timer_(); - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerr_dist("PDGSTRS", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PDGSTRS", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Linv_bc_ptr = Llu->Linv_bc_ptr; - Uinv_bc_ptr = Llu->Uinv_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ stat->utime[SOL_COMM] = 0.0; stat->utime[SOL_GEMM] = 0.0; stat->utime[SOL_TRSM] = 0.0; - stat->utime[SOL_L] = 0.0; - - + stat->utime[SOL_L] = 0.0; + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs()"); + CHECK_MALLOC(iam, "Enter pdgstrs()"); #endif - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intCalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; - if ( !(frecv = intCalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; - - if ( !(leaf_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) ) + if ( !(leaf_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) ) ABORT("Malloc fails for leaf_send[]."); nleaf_send=0; - if ( !(root_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) ) ABORT("Malloc fails for root_send[]."); - nroot_send=0; - + nroot_send=0; #ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); #endif - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + /* Obtain ilsum[] and ldalsum for process column 0. 
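(ilsum[lk] gives the row offset of local block row lk, used through the LSUM_BLK and X_BLK macros to locate blocks in lsum and x; ldalsum is the total number of local rows and enters the size of the lsum work array below)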
*/ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. */ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H); sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d; + + #ifdef _OPENMP - if ( !(lsum = doubleMalloc_dist(sizelsum*num_thread))) - ABORT("Calloc fails for lsum[]."); + if ( !(lsum = (double*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(double)))) + ABORT("Malloc fails for lsum[]."); #pragma omp parallel default(shared) private(thread_id,ii) { thread_id = omp_get_thread_num (); @@ -978,29 +861,30 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, lsum[thread_id*sizelsum+ii]=0; } #else - if ( !(lsum = doubleCalloc_dist(sizelsum*num_thread))) - ABORT("Calloc fails for lsum[]."); + if ( !(lsum = (double*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(double)))) + ABORT("Malloc fails for lsum[]."); + for(ii=0;ii=1 ) t = SuperLU_timer_() - t; if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); fflush(stdout); t = SuperLU_timer_(); -#endif +#endif - /* Set up the headers in lsum[]. */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ - } - ii += knsupc; + /* Set up the headers in lsum[]. */ + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ } - - /* --------------------------------------------------------- - Precompute mapping from Lrowind_bc_ptr to lsum. - --------------------------------------------------------- */ - - - - // nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - // if ( !(Llu->Lrowind_bc_2_lsum = - // (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) - // ABORT("Malloc fails for Lrowind_bc_2_lsum[]."); - - - // for (ljb = 0; ljb < nsupers_j; ++ljb) { - - // if(Lrowind_bc_ptr[ljb]!=NULL){ - - // jb = mycol+ljb*grid->npcol; - - // knsupc = SuperSize( jb ); - // krow = PROW( jb, grid ); - // nrbl = Lrowind_bc_ptr[ljb][0]; - - // if(myrow==krow){ /* skip the diagonal block */ - // nlb_nodiag=nrbl-1; - // idx_i = nlb_nodiag+2; - // m = Lrowind_bc_ptr[ljb][1]-knsupc; - // }else{ - // nlb_nodiag=nrbl; - // idx_i = nlb_nodiag; - // m = Lrowind_bc_ptr[ljb][1]; - // } - - // if(nlb_nodiag>0){ - // if ( !(Llu->Lrowind_bc_2_lsum[ljb] = intMalloc_dist(((m*nrhs + (aln_i - 1)) / aln_i) * aln_i)) ) - // ABORT("Malloc fails for Lrowind_bc_2_lsum[ljb][]."); - // idx_r=0; - // RHS_ITERATE(j) - // for (lb = 0; lb < nlb_nodiag; ++lb) { - // lptr1_tmp = Llu->Lindval_loc_bc_ptr[ljb][lb+idx_i]; - // ik = Lrowind_bc_ptr[ljb][lptr1_tmp]; /* Global block number, row-wise. */ - // iknsupc = SuperSize( ik ); - // nbrow = Lrowind_bc_ptr[ljb][lptr1_tmp+1]; - // lk = LBi( ik, grid ); /* Local block number, row-wise. */ - // il = LSUM_BLK( lk ); - // rel = xsup[ik]; /* Global row index of block ik. */ - // for (ii = 0; ii < nbrow; ++ii) { - // irow = Lrowind_bc_ptr[ljb][lptr1_tmp+LB_DESCRIPTOR+ii] - rel; /* Relative row. 
*/ - // Llu->Lrowind_bc_2_lsum[ljb][idx_r++] = il+irow+ j*iknsupc; - // } - // } - // }else{ - // Llu->Lrowind_bc_2_lsum[ljb]=NULL; - // } - // }else{ - // Llu->Lrowind_bc_2_lsum[ljb]=NULL; - // } - // } + ii += knsupc; + } /* --------------------------------------------------------- Initialize the async Bcast trees on all processes. @@ -1140,13 +966,12 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, } } - for (i = 0; i < nlb; ++i) fmod[i] += frecv[i]; - if ( !(recvbuf_BC_fwd = doubleMalloc_dist(maxrecvsz*(nfrecvx+1))) ) // this needs to be optimized for 1D row mapping + if ( !(recvbuf_BC_fwd = (double*)SUPERLU_MALLOC(maxrecvsz*(nfrecvx+1) * sizeof(double))) ) // this needs to be optimized for 1D row mapping ABORT("Malloc fails for recvbuf_BC_fwd[]."); nfrecvx_buf=0; - + #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); @@ -1254,6 +1079,11 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, lusup, &nsupr, &x[ii], &knsupc); #endif } + + // for (i=0 ; i=1 ) @@ -1263,6 +1093,8 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, #endif stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + + // --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); @@ -1412,8 +1244,9 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, #endif { - + k = *recvbuf0; + #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif @@ -1467,6 +1300,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } // #ifdef _OPENMP @@ -1482,7 +1316,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, knsupc = SuperSize( k ); for (ii=1;ii=1 ) TOC(t2, t1); stat_loc[thread_id]->utime[SOL_TRSM] += t2; - #endif - - stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); @@ -1579,7 +1409,6 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, for (ii=1;iiLrowind_bc_2_lsum[ljb]!=NULL) - // SUPERLU_FREE(Llu->Lrowind_bc_2_lsum[ljb]); - // SUPERLU_FREE(Llu->Lrowind_bc_2_lsum); - - for (lk=0;lkcomm ); - #if ( VAMPIR>=1 ) VT_traceoff(); VT_finalize(); @@ -1688,9 +1510,7 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, Llu->brecv = brecv; k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - // if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - // ABORT("Malloc fails for send_req[]."); - + /* Re-initialize lsum to zero. Each block header is already in place. */ #ifdef _OPENMP @@ -1730,92 +1550,6 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, } } #endif - - - - // /* Set up additional pointers for the index and value arrays of U. - // nub is the number of local block columns. */ - // nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - // if ( !(Urbs = (int_t *) intCalloc_dist(3*nub)) ) - // ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - // blocks in a block column. */ - // Urbs1 = Urbs + nub; - // Urbs2 = Urbs + nub*2; - // if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - // ABORT("Malloc fails for Ucb_indptr[]"); - // if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - // ABORT("Malloc fails for Ucb_valptr[]"); - - // /* Count number of row blocks in a block column. - // One pass of the skeleton graph of U. */ - // for (lk = 0; lk < nlb; ++lk) { - // usub = Ufstnz_br_ptr[lk]; - // if ( usub ) { /* Not an empty block row. 
*/ - // /* usub[0] -- number of column blocks in this block row. */ -// #if ( DEBUGlevel>=2 ) - // Ublocks += usub[0]; -// #endif - // i = BR_HEADER; /* Pointer in index array. */ - // for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - // k = usub[i]; /* Global block number */ - // ++Urbs[LBj(k,grid)]; - // i += UB_DESCRIPTOR + SuperSize( k ); - // } - // } - // } - - // /* Set up the vertical linked lists for the row blocks. - // One pass of the skeleton graph of U. */ - // for (lb = 0; lb < nub; ++lb) { - // if ( Urbs[lb] ) { /* Not an empty block column. */ - // if ( !(Ucb_indptr[lb] - // = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - // ABORT("Malloc fails for Ucb_indptr[lb][]"); - // if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - // ABORT("Malloc fails for Ucb_valptr[lb][]"); - // } - // } - // for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - // usub = Ufstnz_br_ptr[lk]; - // if ( usub ) { /* Not an empty block row. */ - // i = BR_HEADER; /* Pointer in index array. */ - // j = 0; /* Pointer in nzval array. */ - - // // gik = lk * grid->nprow + myrow;/* Global block number, row-wise. */ - // // iklrow = FstBlockC( gik+1 ); - - // for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - // k = usub[i]; /* Global block number, column-wise. */ - // ljb = LBj( k, grid ); /* Local block number, column-wise. */ - // Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - - - - - // Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - // Ucb_valptr[ljb][Urbs1[ljb]] = j; - - // // knsupc = SuperSize( k ); - // // nbrow = 0; - // // for (jj = 0; jj < knsupc; ++jj) { - // // fnz = usub[i +UB_DESCRIPTOR+ jj]; - // // if ( fnz < iklrow ) { - // // if(nbrow=2 ) for (p = 0; p < Pr*Pc; ++p) { @@ -1903,8 +1637,8 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, for (i = 0; i < nlb; ++i) bmod[i] += brecv[i]; // for (i = 0; i < nlb; ++i)printf("bmod[i]: %5d\n",bmod[i]); - - if ( !(recvbuf_BC_fwd = doubleMalloc_dist(maxrecvsz*(nbrecvx+1))) ) // this needs to be optimized for 1D row mapping + + if ( !(recvbuf_BC_fwd = (double*)SUPERLU_MALLOC(maxrecvsz*(nbrecvx+1) * sizeof(double))) ) // this needs to be optimized for 1D row mapping ABORT("Malloc fails for recvbuf_BC_fwd[]."); nbrecvx_buf=0; @@ -2001,7 +1735,11 @@ pdgstrs(int_t n, LUstruct_t *LUstruct, dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif - } + } + // for (i=0 ; iutime[SOL_TRSM] += t2; #endif stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif @@ -2226,7 +1966,7 @@ for (i=0;icomm ); - - - - /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ - // for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); - // SUPERLU_FREE(send_req); - - // MPI_Barrier( grid->comm ); - #if ( PROFlevel>=2 ) { diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c index ba7577bc..73d41b6f 100644 --- a/SRC/pdgstrs_lsum.c +++ b/SRC/pdgstrs_lsum.c @@ -1,13 +1,13 @@ /*! \file - Copyright (c) 2003, The Regents of the University of California, through - Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from U.S. Dept. of Energy) +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) - All rights reserved. +All rights reserved. 
- The source code is distributed under BSD license, see the file License.txt - at the top-level directory. - */ +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ /*! @file @@ -34,20 +34,14 @@ */ #ifdef _CRAY fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); + double*, int*, double*, int*); fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); + int*, double*, int*, double*, double*, int*); _fcd ftcs1; _fcd ftcs2; _fcd ftcs3; #endif - -// #ifndef CACHELINE -// #define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ -// #endif - - /************************************************************************/ /*! \brief * @@ -76,181 +70,167 @@ void dlsum_fmod LocalLU_t *Llu, MPI_Request send_req[], /* input/output */ SuperLUStat_t *stat - ) +) { - double alpha = 1.0, beta = 0.0; - double *lusup, *lusup1; - double *dest; - int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; - int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; - int_t *lsub, *lsub1, nlb1, lptr1, luptr1; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; - MPI_Status status; - int test_flag; + double alpha = 1.0, beta = 0.0; + double *lusup, *lusup1; + double *dest; + int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; + int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + MPI_Status status; + int test_flag; #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; #endif - - #if ( PROFlevel>=1 ) TIC(t1); #endif - - iam = grid->iam; - myrow = MYROW( iam, grid ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Llu->Lrowind_bc_ptr[lk]; - lusup = Llu->Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; - - for (lb = 0; lb < nlb; ++lb) { - ik = lsub[lptr]; /* Global block number, row-wise. */ - nbrow = lsub[lptr+1]; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Llu->Lrowind_bc_ptr[lk]; + lusup = Llu->Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + for (lb = 0; lb < nlb; ++lb) { + ik = lsub[lptr]; /* Global block number, row-wise. */ + nbrow = lsub[lptr+1]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow, 1, 1 ); + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow, 1, 1 ); #else - dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr], &nsupr, xk, - &knsupc, &beta, rtemp, &nbrow ); + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); #endif - stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; - - lk = LBi( ik, grid ); /* Local block number, row-wise. 
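(the GEMM result held in rtemp is now scattered into the running sum lsum for block row ik, using the row indices stored in lsub)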
*/ - iknsupc = SuperSize( ik ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - lptr += LB_DESCRIPTOR; - rel = xsup[ik]; /* Global row index of block ik. */ - for (i = 0; i < nbrow; ++i) { - irow = lsub[lptr++] - rel; /* Relative row. */ - RHS_ITERATE(j) - dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; - } - luptr += nbrow; - - + stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + lptr += LB_DESCRIPTOR; + rel = xsup[ik]; /* Global row index of block ik. */ + for (i = 0; i < nbrow; ++i) { + irow = lsub[lptr++] - rel; /* Relative row. */ + RHS_ITERATE(j) + dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; + } + luptr += nbrow; #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[SOL_GEMM] += t2; - -#endif - - - - - if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ - ikcol = PCOL( ik, grid ); - p = PNUM( myrow, ikcol, grid ); - if ( iam != p ) { +#endif + + if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { #ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); #else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( lk ); - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; - if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ - fmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( ik, grid );/* Local block number, column-wise. */ - lsub1 = Llu->Lrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; - - + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( lk ); + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; #if ( PROFlevel>=1 ) - TIC(t1); + TIC(t1); #endif - #ifdef _CRAY - STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); #elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); + dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); #else - dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - lusup1, &nsupr1, &x[ii], &iknsupc); + dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); #endif - - #if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[SOL_TRSM] += t2; - + TOC(t2, t1); + stat->utime[SOL_TRSM] += t2; #endif - - stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( fsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, ikcol, grid ); + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, ikcol, grid ); #ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); #else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); #endif - } - } - /* - * Perform local block modifications. - */ - nlb1 = lsub1[0] - 1; - lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - fmod, nlb1, lptr1, luptr1, xsup, - grid, Llu, send_req, stat); - } /* if frecv[lk] == 0 */ - } /* if iam == p */ - } /* if fmod[lk] == 0 */ - - } /* for lb ... */ + } + } + /* + * Perform local block modifications. + */ + nlb1 = lsub1[0] - 1; + lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; + luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ + + dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, lptr1, luptr1, xsup, + grid, Llu, send_req, stat); + } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + + } /* for lb ... */ + } /* dLSUM_FMOD */ @@ -274,147 +254,148 @@ void dlsum_bmod SuperLUStat_t *stat ) { - /* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= U_i,k * X[k]. 
- */ - double alpha = 1.0, beta = 0.0; - int iam, iknsupc, knsupc, myrow, nsupr, p, pi; - int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, - j, jj, lk, lk1, nub, ub, uptr; - int_t *usub; - double *uval, *dest, *y; - int_t *lsub; - double *lusup; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *brecv = Llu->brecv; - int_t **bsendx_plist = Llu->bsendx_plist; - MPI_Status status; - int test_flag; - - iam = grid->iam; - myrow = MYROW( iam, grid ); - knsupc = SuperSize( k ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - for (ub = 0; ub < nub; ++ub) { - ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ - usub = Llu->Ufstnz_br_ptr[ik]; - uval = Llu->Unzval_br_ptr[ik]; - i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ - i += UB_DESCRIPTOR; - il = LSUM_BLK( ik ); - gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ - iknsupc = SuperSize( gik ); - ikfrow = FstBlockC( gik ); - iklrow = FstBlockC( gik+1 ); - - RHS_ITERATE(j) { - dest = &lsum[il + j*iknsupc]; - y = &xk[j*knsupc]; - uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ - for (jj = 0; jj < knsupc; ++jj) { - fnz = usub[i + jj]; - if ( fnz < iklrow ) { /* Nonzero segment. */ - /* AXPY */ - for (irow = fnz; irow < iklrow; ++irow) - dest[irow - ikfrow] -= uval[uptr++] * y[jj]; - stat->ops[SOLVE] += 2 * (iklrow - fnz); - } - } /* for jj ... */ +/* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + double alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + double *uval, *dest, *y; + int_t *lsub; + double *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + MPI_Status status; + int test_flag; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat->ops[SOLVE] += 2 * (iklrow - fnz); } + } /* for jj ... */ + } - if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ - gikcol = PCOL( gik, grid ); - p = PNUM( myrow, gikcol, grid ); - if ( iam != p ) { + if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. 
*/ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { #ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); #else - MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( ik ); - dest = &x[ii]; - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; - if ( !brecv[ik] ) { /* Becomes a leaf node. */ - bmod[ik] = -1; /* Do not solve X[k] in the future. */ - lk1 = LBj( gik, grid ); /* Local block number. */ - lsub = Llu->Lrowind_bc_ptr[lk1]; - lusup = Llu->Lnzval_bc_ptr[lk1]; - nsupr = lsub[1]; + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( ik ); + dest = &x[ii]; + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( !brecv[ik] ) { /* Becomes a leaf node. */ + bmod[ik] = -1; /* Do not solve X[k] in the future. */ + lk1 = LBj( gik, grid ); /* Local block number. */ + lsub = Llu->Lrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; #ifdef _CRAY - STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); #elif defined (USE_VENDOR_BLAS) - dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); + dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); #else - dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - lusup, &nsupr, &x[ii], &iknsupc); + dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); #endif - stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, gik); + printf("(%2d) Solve X[%2d]\n", iam, gik); #endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( bsendx_plist[lk1][p] != EMPTY ) { - pi = PNUM( p, gikcol, grid ); + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < grid->nprow; ++p) { + if ( bsendx_plist[lk1][p] != EMPTY ) { + pi = PNUM( p, gikcol, grid ); #ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); #else #ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); #else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); #endif #endif #if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); #endif - } - } - /* - * Perform local block modifications. - */ - if ( Urbs[lk1] ) - dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if brecv[ik] == 0 */ } - } /* if bmod[ik] == 0 */ - - } /* for ub ... */ + } + /* + * Perform local block modifications. + */ + if ( Urbs[lk1] ) + dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ } /* dlSUM_BMOD */ + /************************************************************************/ /*! \brief * @@ -445,23 +426,23 @@ void dlsum_fmod_inv int_t sizelsum, int_t sizertemp, int_t recurlevel - ) +) { - double alpha = 1.0, beta = 0.0,malpha=-1.0; - double *lusup, *lusup1; - double *dest; - double *Linv;/* Inverse of diagonal block */ + double alpha = 1.0, beta = 0.0,malpha=-1.0; + double *lusup, *lusup1; + double *dest; + double *Linv;/* Inverse of diagonal block */ int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; int thread_id,thread_id1,num_thread; - flops_t ops_loc=0.0; - MPI_Status status; - int test_flag; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; yes_no_t done; BcTree *LBtree_ptr = Llu->LBtree_ptr; RdTree *LRtree_ptr = Llu->LRtree_ptr; @@ -474,7 +455,6 @@ void dlsum_fmod_inv int_t luptr; /* Starting position in lusup[*]. 
*/ maxsuper = sp_ienv_dist(3); - #ifdef _OPENMP thread_id = omp_get_thread_num (); num_thread = omp_get_num_threads (); @@ -491,7 +471,6 @@ void dlsum_fmod_inv float msg_vol = 0, msg_cnt = 0; // #endif - if(nlb>0){ maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); @@ -571,20 +550,20 @@ void dlsum_fmod_inv lptr1_tmp = lloc[lb+idx_i]; nbrow += lsub[lptr1_tmp+1]; } - -#ifdef _CRAY + + #ifdef _CRAY SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow ); -#elif defined (USE_VENDOR_BLAS) + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); -#else + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow ); -#endif + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif nbrow_ref=0; for (lb = lbstart; lb < lbend; ++lb){ @@ -593,8 +572,7 @@ void dlsum_fmod_inv nbrow1 = lsub[lptr1_tmp+1]; ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ rel = xsup[ik]; /* Global row index of block ik. */ - - + lk = LBi( ik, grid ); /* Local block number, row-wise. */ iknsupc = SuperSize( ik ); @@ -603,7 +581,6 @@ void dlsum_fmod_inv RHS_ITERATE(j) for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; @@ -616,7 +593,6 @@ void dlsum_fmod_inv for (lb=lbstart;lbinv == 1){ Linv = Llu->Linv_bc_ptr[lk]; + + #ifdef _CRAY SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, &alpha, Linv, &iknsupc, &x[ii], @@ -684,10 +661,11 @@ void dlsum_fmod_inv dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, &alpha, Linv, &iknsupc, &x[ii], &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif +#endif for (i=0 ; i=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_TRSM] += t2; #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); + #endif /* @@ -745,7 +731,7 @@ void dlsum_fmod_inv } } - } + } }else{ @@ -766,10 +752,6 @@ void dlsum_fmod_inv &alpha, &lusup[luptr_tmp], &nsupr, xk, &knsupc, &beta, rtemp_loc, &m ); #endif - - // for (i = 0; i < m*nrhs; ++i) { - // lsum[idx_lsum[i]+sizelsum*thread_id] -=rtemp_loc[i]; - // } nbrow=0; for (lb = 0; lb < nlb; ++lb){ @@ -793,19 +775,16 @@ void dlsum_fmod_inv for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. 
*/ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; } - - // TOC(t3, t1); #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; - #endif thread_id1 = omp_get_thread_num (); @@ -834,8 +813,8 @@ void dlsum_fmod_inv for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_TRSM] += t2; - #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); #endif @@ -944,18 +928,14 @@ void dlsum_fmod_inv } /* if fmod[lk] == 0 */ } // } - } + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; -stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + } /* if nlb>0*/ -} /* dLSUM_FMOD_inv */ - - - - +} /* dLSUM_FMOD_INV */ /************************************************************************/ /*! \brief @@ -985,24 +965,24 @@ void dlsum_fmod_inv_master int_t sizelsum, int_t sizertemp, int_t recurlevel - ) +) { - double alpha = 1.0, beta = 0.0,malpha=-1.0; - double *lusup, *lusup1; - double *dest; - double *Linv;/* Inverse of diagonal block */ + double alpha = 1.0, beta = 0.0,malpha=-1.0; + double *lusup, *lusup1; + double *dest; + double *Linv;/* Inverse of diagonal block */ int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; - int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; int thread_id,thread_id1,num_thread; - int m; - flops_t ops_loc=0.0; - MPI_Status status; - int test_flag; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; yes_no_t done; BcTree *LBtree_ptr = Llu->LBtree_ptr; RdTree *LRtree_ptr = Llu->LRtree_ptr; @@ -1010,12 +990,11 @@ void dlsum_fmod_inv_master double *rtemp_loc; int_t ldalsum,maxsuper,aln_d; int dword = sizeof (double); + int_t nleaf_send_tmp; int_t lptr; /* Starting position in lsub[*]. */ int_t luptr; /* Starting position in lusup[*]. */ maxsuper = sp_ienv_dist(3); - - #ifdef _OPENMP thread_id = omp_get_thread_num (); num_thread = omp_get_num_threads (); @@ -1027,29 +1006,34 @@ void dlsum_fmod_inv_master rtemp_loc = &rtemp[sizertemp* thread_id]; - // #if ( PROFlevel>=1 ) double t1, t2, t3, t4; float msg_vol = 0, msg_cnt = 0; // #endif - if(nlb>0){ - maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); iam = grid->iam; myrow = MYROW( iam, grid ); lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + lsub = Llu->Lrowind_bc_ptr[lk]; + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + lusup = Llu->Lnzval_bc_ptr[lk]; lloc = Llu->Lindval_loc_bc_ptr[lk]; // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; nsupr = lsub[1]; + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); krow = PROW( k, grid ); if(myrow==krow){ @@ -1067,21 +1051,16 @@ void dlsum_fmod_inv_master } assert(m>0); - + if(m>4*maxsuper || nrhs>10){ // if(m<1){ - - - // TIC(t1); Nchunk=num_thread; nlb_loc = floor(((double)nlb)/Nchunk); remainder = nlb % Nchunk; - - #ifdef _OPENMP -#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied #endif for (nn=0;nn=1 ) TIC(t1); #endif - luptr_tmp1 = lloc[lbstart+idx_v]; nbrow=0; for (lb = lbstart; lb < lbend; ++lb){ lptr1_tmp = lloc[lb+idx_i]; nbrow += lsub[lptr1_tmp+1]; } - - -#ifdef _CRAY + + #ifdef _CRAY SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow ); -#elif defined (USE_VENDOR_BLAS) + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); -#else + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp1], &nsupr, xk, - &knsupc, &beta, rtemp_loc, &nbrow ); -#endif - - + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif nbrow_ref=0; for (lb = lbstart; lb < lbend; ++lb){ @@ -1140,8 +1112,7 @@ void dlsum_fmod_inv_master nbrow1 = lsub[lptr1_tmp+1]; ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ rel = xsup[ik]; /* Global row index of block ik. */ - - + lk = LBi( ik, grid ); /* Local block number, row-wise. */ iknsupc = SuperSize( ik ); @@ -1150,7 +1121,6 @@ void dlsum_fmod_inv_master RHS_ITERATE(j) for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; @@ -1160,8 +1130,9 @@ void dlsum_fmod_inv_master TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } + } + }else{ #if ( PROFlevel>=1 ) @@ -1181,16 +1152,12 @@ void dlsum_fmod_inv_master &alpha, &lusup[luptr_tmp], &nsupr, xk, &knsupc, &beta, rtemp_loc, &m ); #endif - - // for (i = 0; i < m*nrhs; ++i) { - // lsum[idx_lsum[i]] -=rtemp_loc[i]; - // } nbrow=0; for (lb = 0; lb < nlb; ++lb){ lptr1_tmp = lloc[lb+idx_i]; nbrow += lsub[lptr1_tmp+1]; - } + } nbrow_ref=0; for (lb = 0; lb < nlb; ++lb){ lptr1_tmp = lloc[lb+idx_i]; @@ -1208,23 +1175,16 @@ void dlsum_fmod_inv_master for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. 
*/ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; - } - + } #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; - #endif - - } - - // TOC(t3, t1); - - - + } + // TOC(t3, t1); thread_id1 = omp_get_thread_num (); @@ -1266,8 +1226,7 @@ void dlsum_fmod_inv_master for (ii=1;iiLnzval_bc_ptr[lk]; nsupr1 = lsub1[1]; - - - if(Llu->inv == 1){ Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY @@ -1319,7 +1274,7 @@ void dlsum_fmod_inv_master &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif for (i=0 ; i=1 ) TOC(t2, t1); @@ -1340,8 +1299,8 @@ void dlsum_fmod_inv_master #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); #endif @@ -1374,13 +1333,9 @@ void dlsum_fmod_inv_master } /* if fmod[lk] == 0 */ } // } - - - stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; } /* if nlb>0*/ -} /* dlsum_fmod_inv_master */ - +} /* dLSUM_FMOD_INV */ @@ -1415,7 +1370,7 @@ void dlsum_bmod_inv * ======= * Perform local block modifications: lsum[i] -= U_i,k * X[k]. */ - double alpha = 1.0, beta = 0.0; + double alpha = 1.0, beta = 0.0; int iam, iknsupc, knsupc, myrow, nsupr, p, pi; int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, j, jj, lk, lk1, nub, ub, uptr; @@ -1435,7 +1390,7 @@ void dlsum_bmod_inv double *rtemp_loc; int_t nroot_send_tmp; double *Uinv;/* Inverse of diagonal block */ - + double temp; double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; @@ -1455,11 +1410,6 @@ void dlsum_bmod_inv knsupc = SuperSize( k ); lk = LBj( k, grid ); /* Local block number, column-wise. */ nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - - - // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); - // fflush(stdout); if(nub>num_thread){ // // // // if(Urbs2[lk]>num_thread){ @@ -1469,7 +1419,7 @@ void dlsum_bmod_inv remainder = nub % Nchunk; #ifdef _OPENMP -#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup #endif for (nn=0;nnops[SOLVE] += 2 * (iklrow - fnz); + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } } /* for jj ... */ } @@ -1536,8 +1487,8 @@ void dlsum_bmod_inv for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_TRSM] += t2; - #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, gik); @@ -1679,8 +1635,9 @@ void dlsum_bmod_inv if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ for (irow = fnz; irow < iklrow; ++irow) - dest[irow - ikfrow] -= uval[uptr++] * y[jj]; - stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); } } /* for jj ... 
*/ } @@ -1701,9 +1658,8 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;iiutime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; - + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, gik); #endif @@ -1826,10 +1782,6 @@ void dlsum_bmod_inv - - - - /************************************************************************/ void dlsum_bmod_inv_master /************************************************************************/ @@ -1859,7 +1811,7 @@ void dlsum_bmod_inv_master * ======= * Perform local block modifications: lsum[i] -= U_i,k * X[k]. */ - double alpha = 1.0, beta = 0.0; + double alpha = 1.0, beta = 0.0; int iam, iknsupc, knsupc, myrow, nsupr, p, pi; int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, j, jj, lk, lk1, nub, ub, uptr; @@ -1877,7 +1829,7 @@ void dlsum_bmod_inv_master int_t bmod_tmp; int thread_id,thread_id1,num_thread; double *rtemp_loc; - + double temp; double *Uinv;/* Inverse of diagonal block */ double t1, t2; @@ -1912,7 +1864,7 @@ void dlsum_bmod_inv_master remainder = nub % Nchunk; #ifdef _OPENMP -#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied #endif for (nn=0;nnops[SOLVE] += 2 * (iklrow - fnz); + } } /* for jj ... */ } @@ -2000,6 +1953,7 @@ void dlsum_bmod_inv_master for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } } /* for jj ... 
*/ } @@ -2036,7 +1990,7 @@ void dlsum_bmod_inv_master for (ii=1;ii=2 ) @@ -2051,7 +2005,7 @@ void dlsum_bmod_inv_master for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_TRSM] += t2; -#endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; - +#endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, gik); #endif @@ -2138,7 +2092,3 @@ void dlsum_bmod_inv_master } } /* dlsum_bmod_inv_master */ - - - - diff --git a/SRC/pdsymbfact_distdata.c b/SRC/pdsymbfact_distdata.c index 436a611c..8fb34757 100644 --- a/SRC/pdsymbfact_distdata.c +++ b/SRC/pdsymbfact_distdata.c @@ -1196,10 +1196,10 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t Glu_freeable_n; LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb,ib, jj, k, k1, + int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, len, len1, nsupc, nsupc_gb, ii, nprocs; int_t lib; /* local block row number */ - int_t nlb; /* local block rows*/ + int_t nlb; /* local block rows*/ int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ @@ -1224,11 +1224,11 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t *index_srt; /* indices consist of headers and row subscripts */ - double *lusup_srt; /* nonzero values in L and U */ + double *lusup_srt; /* nonzero values in L and U */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ @@ -1241,7 +1241,8 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - + + /*-- Counts to be used in factorization. 
--*/ int *ToRecv, *ToSendD, **ToSendR; @@ -1268,15 +1269,6 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *LUb_number; /* global block number; size nsupers_ij */ int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - - - float memStrLU, memA, - memDist = 0.; /* memory used for redistributing the data, which does - not include the memory for the numerical values - of L and U (positive number)*/ - float memNLU = 0.; /* memory allocated for storing the numerical values of - L and U, that will be used in the numeric - factorization (positive number) */ int_t *ActiveFlag; int_t *ActiveFlagAll; int_t Iactive; @@ -1285,7 +1277,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t **nzrows; double rseed; int rank_cnt,rank_cnt_ref,Root; - double *dense, *dense_col; /* SPA */ +double *dense, *dense_col; /* SPA */ double zero = 0.0; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; @@ -1298,7 +1290,15 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t ik, il, lk, rel, knsupc, idx_r; int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; int_t nub; - + + float memStrLU, memA, + memDist = 0.; /* memory used for redistributing the data, which does + not include the memory for the numerical values + of L and U (positive number)*/ + float memNLU = 0.; /* memory allocated for storing the numerical values of + L and U, that will be used in the numeric + factorization (positive number) */ + #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif @@ -1321,7 +1321,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, iword = sizeof(int_t); dword = sizeof(double); - aln_i = ceil(CACHELINE/(double)iword); + aln_i = ceil(CACHELINE/(double)iword); if (fact == SamePattern_SameRowPerm) { ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); @@ -1487,32 +1487,31 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); return (memDist + memNLU); } - if ( !(Linv_bc_ptr = - (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Uinv_bc_ptr = - (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); - return (memDist + memNLU); - } - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); return (memDist + memNLU); } - + + if ( !(Linv_bc_ptr = + (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + return (memDist + memNLU); + } + if ( !(Uinv_bc_ptr = + (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + return (memDist + memNLU); + } if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); return (memDist + memNLU); - } + } memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*); Lnzval_bc_ptr[nsupers_j-1] = NULL; + Lrowind_bc_ptr[nsupers_j-1] = NULL; Linv_bc_ptr[nsupers_j-1] = NULL; Uinv_bc_ptr[nsupers_j-1] = NULL; - Lrowind_bc_ptr[nsupers_j-1] = NULL; Lindval_loc_bc_ptr[nsupers_j-1] = NULL; /* These lists of processes will be used for triangular solves. 
*/ @@ -1801,24 +1800,17 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb); return (memDist + memNLU); } - - if (!(Linv_bc_ptr[ljb_j] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[*][] col block " IFMT, jb); - return (memDist + memNLU); - } - if (!(Uinv_bc_ptr[ljb_j] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[*][] col block " IFMT, jb); - return (memDist + memNLU); - } - + + if (!(Linv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]"); + if (!(Uinv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]"); + memNLU += len1*iword + len*nsupc*dword; if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); - - + lusup = Lnzval_bc_ptr[ljb_j]; mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); @@ -1836,7 +1828,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Lindval_loc_bc_ptr[ljb_j][k] = lb; Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; - + LUb_length[lb] = 0; index[next_ind++] = gb; /* Descriptor */ index[next_ind++] = len; @@ -1866,6 +1858,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, } } /* for i ... */ + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ if(nrbl>1){ @@ -1883,9 +1876,9 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, if ( !(index_srt = intMalloc_dist(len1)) ) ABORT("Malloc fails for index_srt[]"); - if (!(lusup_srt = doubleMalloc_dist(len*nsupc))) + if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double)))) ABORT("Malloc fails for lusup_srt[]"); - + idx_indx = BC_HEADER; idx_lusup = 0; for (jj=0;jjnpcol); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub1 = Ufstnz_br_ptr[lk]; - if ( usub1 ) { /* Not an empty block row. */ - /* usub1[0] -- number of column blocks in this block row. */ - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ - k = usub1[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. 
*/ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); } } + } - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub1 = Ufstnz_br_ptr[lk]; - if ( usub1 ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - - for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ - k = usub1[i]; /* Global block number, column-wise. */ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - - ++Urbs1[ljb]; - j += usub1[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); } - } - - - - - ///////////////////////////////////////////////////////////////// + } + } + + + + + ///////////////////////////////////////////////////////////////// - // if(LSUM=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Bcast tree for L ... 
*/ - - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - ABORT("Malloc fails for LBtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); - if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_BC[]."); - - for (i=0;icscp.comm); + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); - if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) ) - ABORT("Calloc fails for ActiveFlag[]."); - for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; - for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); - } /* for j ... */ - } - } + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); - + istart = xlsub[ljb]; + for (i = istart; i < xlsub[ljb+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + ActiveFlagAll[pr+ljb*grid->nprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } - for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ - - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; - for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; - for (j=0;jnprow;++j)ranks[j]=-1; - - Root=-1; - Iactive = 0; - for (j=0;jnprow;++j){ - if(ActiveFlag[j]!=3*nsupers){ - gb = ActiveFlag[j]; - pr = PROW( gb, grid ); - if(gb==jb)Root=pr; - if(myrow==pr)Iactive=1; - } - } - - - quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); - - if(Iactive==1){ - // printf("jb %5d damn\n",jb); - // fflush(stdout); - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->nprow; ++j){ - if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; - ++rank_cnt; - } - } + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } - if(rank_cnt>1){ + if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); - BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d'); + // rseed=rand(); + // rseed=1.0; + msgsize = SuperSize( jb )*nrhs+XK_H; + LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d'); - // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); - // if(iam==15 || iam==3){ - // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'d')); - // fflush(stdout); - // } + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'d')); + // fflush(stdout); + // } - // #if ( PRNTlevel>=1 ) - if(Root==myrow){ - rank_cnt_ref=1; - for (j = 0; j < grid->nprow; ++j) { - if ( fsendx_plist[ljb][j] != EMPTY ) { - ++rank_cnt_ref; - } + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; } - assert(rank_cnt==rank_cnt_ref); + } + assert(rank_cnt==rank_cnt_ref); - // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); - // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); #endif - + #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for L ... */ - /* the following is used as reference */ - nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - pr = PROW( k, grid ); - if ( myrow == pr ) { - lib = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if (mycol == kcol || fmod[lib] ) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } + /* construct the Reduce tree for L ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - ABORT("Malloc fails for LRtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); - // if ( !(idxs = intCalloc_dist(nsupers)) ) - // ABORT("Calloc fails for idxs[]."); + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); - // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - // ABORT("Malloc fails for nzrows[]."); + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); - if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_RD[]."); + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); - for (i=0;irscp.comm); + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); - for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; - if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; - - - - for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); - } + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); } } } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; - MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } - for (lib=0;libnprow; /* not sure */ - if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; - for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; - for (j=0;jnpcol;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - - for (j=0;jnpcol;++j){ - if(ActiveFlag[j]!=-3*nsupers){ - jb = ActiveFlag[j]; - pc = PCOL( jb, grid ); - if(jb==ib)Root=pc; - if(mycol==pc)Iactive=1; - } - } - - - quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); - - if(Iactive==1){ - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->npcol; ++j){ - if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; - ++rank_cnt; - } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; } - if(rank_cnt>1){ + } + if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'d'); + // } - LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); - RdTree_SetTag(LRtree_ptr[lib], RD_L,'d'); - // } + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); - // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); - - #if ( PRNTlevel>=1 ) - if(Root==mycol){ - assert(rank_cnt==frecv[lib]); - // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;j=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Bcast tree for U ... */ + /* construct the Bcast tree for U ... 
*/ - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - ABORT("Malloc fails for UBtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); - if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_BC[]."); + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); - for (i=0;icscp.comm); + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm); - for (ljb = 0; ljb nprow*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; - - + if ( !(ActiveFlagAll = intMalloc_dist(grid->nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ - for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ - ib = myrow+lib*grid->nprow; /* not sure */ - - // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); - // fflush(stdout); - - if(ibnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); - } - } /* for i ... */ - pr = PROW( ib, grid ); // take care of diagonal node stored as L - pc = PCOL( ib, grid ); - if ( mycol == pc ) { /* Block column ib in my process column */ - ljb = LBj( ib, grid ); /* local block number */ - ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); - // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); - // fflush(stdout); - } - } - } + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); - // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); - // fflush(stdout); + if(ibnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } - MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); - - for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */ - jb = mycol+ljb*grid->npcol; /* not sure */ - if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; - for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; - for (j=0;jnprow;++j)ranks[j]=-1; - - Root=-1; - Iactive = 0; - for (j=0;jnprow;++j){ - if(ActiveFlag[j]!=-3*nsupers){ - gb = ActiveFlag[j]; - pr = PROW( gb, grid ); - if(gb==jb)Root=pr; - if(myrow==pr)Iactive=1; + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; } - } - - quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); - // printf("jb: %5d Iactive %5d\n",jb,Iactive); - // fflush(stdout); - if(Iactive==1){ - // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); - fflush(stdout); - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->nprow; ++j){ - if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; - ++rank_cnt; - } - } - // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); - // fflush(stdout); - if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); - BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d'); - - // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); // fflush(stdout); - - if(Root==myrow){ - rank_cnt_ref=1; - for (j = 0; j < grid->nprow; ++j) { - // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); - // fflush(stdout); - if ( bsendx_plist[ljb][j] != EMPTY ) { - ++rank_cnt_ref; - } + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; } - // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); - // fflush(stdout); - assert(rank_cnt==rank_cnt_ref); - } } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } } - } - } - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Bcast tree for U: %.2f\t\n", t); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for U ... */ - /* the following is used as reference */ - nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for mod_bit[]."); - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - pr = PROW( k, grid ); - if ( myrow == pr ) { - lib = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if (mycol == kcol || bmod[lib] ) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - ABORT("Malloc fails for URtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) - ABORT("Malloc fails for ranks[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); - // if ( !(idxs = intCalloc_dist(nsupers)) ) - // ABORT("Calloc fails for idxs[]."); + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); - // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - // ABORT("Malloc fails for nzrows[]."); + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); - if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_RD[]."); + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); - for (i=0;irscp.comm); + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); - for (lib = 0; lib npcol*k)) ) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; - - for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... 
*/ - ib = myrow+lib*grid->nprow; /* not sure */ - if(ibnpcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); - } - } /* for i ... */ - pc = PCOL( ib, grid ); - if ( mycol == pc ) { /* Block column ib in my process column */ - ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); - } - } - } - - MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); - - for (lib=0;libnprow; /* not sure */ - if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; - for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; - for (j=0;jnpcol;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - - for (j=0;jnpcol;++j){ - if(ActiveFlag[j]!=3*nsupers){ - jb = ActiveFlag[j]; - pc = PCOL( jb, grid ); - if(jb==ib)Root=pc; - if(mycol==pc)Iactive=1; - } - } - - quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); - - if(Iactive==1){ - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->npcol; ++j){ - if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; - ++rank_cnt; - } + + if ( !(ActiveFlagAll = intMalloc_dist(grid->npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... */ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; } - if(rank_cnt>1){ + } + if(rank_cnt>1){ - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); - RdTree_SetTag(URtree_ptr[lib], RD_U,'d'); - // } - - // #if ( PRNTlevel>=1 ) - if(Root==mycol){ - // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); - // fflush(stdout); - assert(rank_cnt==brecv[lib]); - // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;jcomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'d'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); #endif - - //////////////////////////////////////////////////////// - - - + + //////////////////////////////////////////////////////// + /* Free the memory used for storing L and U */ SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); if (lsub != NULL) @@ -2754,6 +2741,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, if (usub != NULL) SUPERLU_FREE(usub); + SUPERLU_FREE(nnzToRecv); SUPERLU_FREE(ptrToRecv); SUPERLU_FREE(nnzToSend); @@ -2761,10 +2749,10 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, SUPERLU_FREE(recvBuf); Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Linv_bc_ptr = Linv_bc_ptr; - Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; @@ -2780,7 +2768,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; + LUstruct->Glu_persist = Glu_persist; Llu->LRtree_ptr = LRtree_ptr; Llu->LBtree_ptr = LBtree_ptr; Llu->URtree_ptr = URtree_ptr; @@ -2788,7 +2776,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Llu->Urbs = Urbs; Llu->Ucb_indptr = Ucb_indptr; Llu->Ucb_valptr = Ucb_valptr; - + #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); diff --git a/SRC/pdutil.c b/SRC/pdutil.c index 096421b9..0a891bf4 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -533,17 +533,6 @@ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); - fflush(stdout); - - // while(1); - - // if(err>1e-5){ - // if( !iam ) printf("Wrong solution! \n"); - // fflush(stdout); - // while(1); - - // ABORT("Wrong solution! \n"); -// } } } @@ -558,7 +547,7 @@ dDestroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) #if ( DEBUGlevel>=1 ) int iam; MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - CHECK_MALLOC(iam, "Enter Destroy_LU()"); + CHECK_MALLOC(iam, "Enter dDestroy_Tree()"); #endif nsupers = Glu_persist->supno[n-1] + 1; @@ -592,4 +581,4 @@ dDestroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dDestroy_Tree()"); #endif -} +} \ No newline at end of file diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c index 2bed7bcc..d9521f94 100644 --- a/SRC/pzdistribute.c +++ b/SRC/pzdistribute.c @@ -20,6 +20,9 @@ at the top-level directory. #include "superlu_zdefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif /*! \brief * @@ -319,7 +322,7 @@ float pzdistribute(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, - gridinfo_t *grid) + gridinfo_t *grid, int_t nrhs) /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. 
@@ -368,32 +371,46 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol, mycol, myrow, pc, pr; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; int_t mybufmax[NBUFFERS]; NRformat_loc *Astore; doublecomplex *a; int_t *asub, *xa; + int_t *xa_begin, *xa_end; int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; int_t nsupers; int_t next_lind; /* next available position in index[*] */ int_t next_lval; /* next available position in nzval[*] */ int_t *index; /* indices consist of headers and row subscripts */ - int *index1; /* temporary pointer to array of int */ - doublecomplex *lusup, *uval; /* nonzero values in L and U */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + doublecomplex *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ /*-- Counts to be used in factorization. 
--*/ int *ToRecv, *ToSendD, **ToSendR; @@ -422,12 +439,32 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + doublecomplex *dense, *dense_col; /* SPA */ doublecomplex zero = {0.0, 0.0}; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; + int_t nub; + int tag; + #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif @@ -444,10 +481,11 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, nsupers = supno[n-1] + 1; Astore = (NRformat_loc *) A->Store; -#if ( PRNTlevel>=1 ) +//#if ( PRNTlevel>=1 ) iword = sizeof(int_t); dword = sizeof(doublecomplex); -#endif + aln_i = ceil(CACHELINE/(double)iword); +//#endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzdistribute()"); @@ -482,6 +520,7 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) ABORT("Malloc fails for Urb_indptr[]."); Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; @@ -780,6 +819,24 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k-1] = NULL; + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + + /* These lists of processes will be used for triangular solves. */ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) ABORT("Malloc fails for fsendx_plist[]."); @@ -945,14 +1002,16 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, index[] and nzval[]. 
*/ /* Add room for descriptors */ len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - Lrowind_bc_ptr[ljb] = index; - if (!(Lnzval_bc_ptr[ljb] = - doublecomplexMalloc_dist(len*nsupc))) { - fprintf(stderr, "col block " IFMT " ", jb); - ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); - } + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); @@ -964,6 +1023,9 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, gb = Lrb_number[k]; lb = LBi( gb, grid ); len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; Lrb_length[lb] = 0; /* Reset vector of block length */ index[next_lind++] = gb; /* Descriptor */ index[next_lind++] = len; @@ -974,7 +1036,6 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, } /* Propagate the compressed row subscripts to Lindex[], and the initial values of A from SPA into Lnzval[]. */ - lusup = Lnzval_bc_ptr[ljb]; len = index[1]; /* LDA of lusup[] */ for (i = istart; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; @@ -993,9 +1054,78 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, } } } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) t_l += SuperLU_timer_() - t; @@ -1004,7 +1134,746 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, } /* for jb ... */ + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. 
*/ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'z')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'z'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'z')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'z'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; @@ -1022,6 +1891,17 @@ pzdistribute(fact_t fact, int_t n, SuperMatrix *A, Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + + #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c index dc382bcb..b151a71c 100644 --- a/SRC/pzgssvx.c +++ b/SRC/pzgssvx.c @@ -239,7 +239,7 @@ at the top-level directory. * The user must also supply * * o A, the unfactored matrix, only in the case that iterative - * refinment is to be done (specifically A must be the output + * refinement is to be done (specifically A must be the output * A from the previous call, so that it has been scaled and permuted) * o all of ScalePermstruct * o all of LUstruct, including the actual numerical values of @@ -342,7 +342,7 @@ at the top-level directory. * = SLU_DOUBLE: accumulate residual in double precision. * = SLU_EXTRA: accumulate residual in extra precision. * - * NOTE: all options must be indentical on all processes when + * NOTE: all options must be identical on all processes when * calling this routine. * * A (input/output) SuperMatrix* (local) @@ -467,7 +467,7 @@ at the top-level directory. * SOLVEstruct (input/output) SOLVEstruct_t* * The data structure to hold the communication pattern used * in the phases of triangular solution and iterative refinement. - * This pattern should be intialized only once for repeated solutions. + * This pattern should be initialized only once for repeated solutions. * If options->SolveInitialized = YES, it is an input argument. * If options->SolveInitialized = NO and nrhs != 0, it is an output * argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'. @@ -550,7 +550,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, int col, key; /* parameters for creating a new communicator */ Pslu_freeable_t Pslu_freeable; float flinfo; - + int blas_flag; + /* Initialization. */ m = A->nrow; n = A->ncol; @@ -649,8 +650,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, } /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. (simple scheme) - ------------------------------------------------------------*/ + * Diagonal scaling to equilibrate the matrix. (simple scheme) + * for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:)); + * for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j)) + * ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); @@ -974,7 +977,11 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( permc_spec != MY_PERMC && Fact == DOFACT ) { /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */ if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * + // #pragma omp parallel + // { + // #pragma omp master + // { + /* Get column permutation vector in perm_c. 
* * This routine takes as input the distributed input matrix A * * and does not modify it. It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * @@ -982,6 +989,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); + // } + // } if (flinfo > 0) { #if ( PRNTlevel>=1 ) fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n"); @@ -1104,7 +1113,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, distribution routine. */ t = SuperLU_timer_(); dist_mem_use = pzdistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); + Glu_freeable, LUstruct, grid, nrhs); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ @@ -1121,7 +1130,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, t = SuperLU_timer_(); dist_mem_use = zdist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); + &Pslu_freeable, LUstruct, grid, nrhs); if (dist_mem_use > 0) ABORT ("Not enough memory available for dist_psymbtonum\n"); @@ -1132,9 +1141,15 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); + // #pragma omp parallel + // { + // #pragma omp master + // { pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; - + // } + // } + #if 0 // #ifdef GPU_PROF @@ -1304,11 +1319,30 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, For repeated call to pzgssvx(), no need to re-initialilze the Solve data & communication structures, unless a new factorization with Fact == DOFACT or SamePattern is asked for. */ + if(options->DiagInv==YES){ + + #ifdef _CRAY + blas_flag=1; + #elif defined (USE_VENDOR_BLAS) + blas_flag=2; + #else + blas_flag=0; + #endif + if(blas_flag==0) + ABORT("DiagInv doesn't works with internal blas\n"); + pzCompute_Diag_Inv(n, LUstruct, grid, stat, info); + } } + // #pragma omp parallel + // { + // #pragma omp master + // { pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info); - + // } + // } + /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c index 5a11f84b..d82c179a 100644 --- a/SRC/pzgstrs.c +++ b/SRC/pzgstrs.c @@ -19,8 +19,11 @@ at the top-level directory. 
* October 15, 2008 * */ - +#include #include "superlu_zdefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif /* * Sketch of the algorithm for L-solve: @@ -159,10 +162,12 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, int_t *send_ibuf, *recv_ibuf; doublecomplex *send_dbuf, *recv_dbuf; int_t *xsup, *supno; - int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; int p, procs; pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_B_to_X()"); #endif @@ -197,6 +202,14 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)* (size_t)nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; @@ -217,7 +230,7 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, ++ptr_to_ibuf[p]; ptr_to_dbuf[p] += nrhs; } - +#if 1 /* Communicate the (permuted) row indices. */ MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); @@ -226,7 +239,20 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); - +#else + + /* Communicate the (permuted) row indices. */ + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); + + /* Communicate the numerical values. */ + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, + grid->comm, &req_d); + MPI_Wait(&req_i,&status); + MPI_Wait(&req_d,&status); + +#endif /* ------------------------------------------------------------ Copy buffer into X on the diagonal processes. 
------------------------------------------------------------*/ @@ -253,6 +279,10 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_B_to_X()"); @@ -293,7 +323,10 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; int iam, p, q, pkk, procs; int_t num_diag_procs, *diag_procs; - + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp; + #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Enter pzReDistribute_X_to_B()"); #endif @@ -325,7 +358,15 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f recv_ibuf = send_ibuf + k; if ( !(send_dbuf = doublecomplexMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + recv_dbuf = send_dbuf + k * nrhs; for (p = 0; p < procs; ++p) { ptr_to_ibuf[p] = sdispls[p]; ptr_to_dbuf[p] = sdispls_nrhs[p]; @@ -365,12 +406,22 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f /* ------------------------------------------------------------ COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. ------------------------------------------------------------*/ +#if 1 MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm); - +#else + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i); + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, SuperLU_MPI_DOUBLE_COMPLEX, + grid->comm,&req_d); + + MPI_Wait(&req_i,&status); + MPI_Wait(&req_d,&status); +#endif /* ------------------------------------------------------------ COPY THE BUFFER INTO B. 
------------------------------------------------------------*/ @@ -384,6 +435,10 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f SUPERLU_FREE(send_ibuf); SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(grid->iam, "Exit pzReDistribute_X_to_B()"); #endif @@ -391,6 +446,123 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f } /* pzReDistribute_X_to_B */ + + + + void +pzCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + + doublecomplex *lusup; + doublecomplex *recvbuf, *tempv; + doublecomplex *Linv;/* Inverse of diagonal block */ + doublecomplex *Uinv;/* Inverse of diagonal block */ + + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb,nlb_nodiag, nub, nsupers; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + doublecomplex **Lnzval_bc_ptr; + doublecomplex **Linv_bc_ptr; + doublecomplex **Uinv_bc_ptr; + int INFO; + double t; + + doublecomplex one = {1.0, 0.0}; + doublecomplex zero = {0.0, 0.0}; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + if(grid->iam==0){ + printf("computing inverse of diagonal blocks...\n"); + fflush(stdout); + } + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + + Llu->inv = 1; + + /*--------------------------------------------------- + * Compute inverse of L(lk,lk). + *---------------------------------------------------*/ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* diagonal process */ + + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + Linv = Linv_bc_ptr[lk]; + Uinv = Uinv_bc_ptr[lk]; + nsupr = lsub[1]; + knsupc = SuperSize( k ); + + for (j=0 ; j=1 ) + if(grid->iam==0){ + t = SuperLU_timer_() - t; + printf(".. L-diag_inv time\t%10.5f\n", t); + fflush(stdout); + } +#endif + + return; +} + + + /*! \brief * *
@@ -473,59 +645,139 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
     doublecomplex alpha = {1.0, 0.0};
+	doublecomplex beta = {0.0, 0.0};
     doublecomplex zero = {0.0, 0.0};
     doublecomplex *lsum;  /* Local running sum of the updates to B-components */
     doublecomplex *x;     /* X component at step k. */
 		    /* NOTE: x and lsum are of same size. */
     doublecomplex *lusup, *dest;
-    doublecomplex *recvbuf, *tempv;
-    doublecomplex *rtemp; /* Result of full matrix-vector multiply. */
-    int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    int_t  *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
-    int_t  kcol, krow, mycol, myrow;
-    int_t  i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
-    int_t  nb, nlb, nub, nsupers;
-    int_t  *xsup, *supno, *lsub, *usub;
-    int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
-    int    Pc, Pr, iam;
-    int    knsupc, nsupr;
-    int    ldalsum;   /* Number of lsum entries locally owned. */
-    int    maxrecvsz, p, pi;
-    int_t  **Lrowind_bc_ptr;
-    doublecomplex **Lnzval_bc_ptr;
-    MPI_Status status;
-    MPI_Request *send_req, recv_req;
-    pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
-
-    /*-- Counts used for L-solve --*/
-    int_t  *fmod;         /* Modification count for L-solve --
-                             Count the number of local block products to
-                             be summed into lsum[lk]. */
-    int_t  **fsendx_plist = Llu->fsendx_plist;
-    int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
-    int_t  *frecv;        /* Count of lsum[lk] contributions to be received
-                             from processes in this row. 
-                             It is only valid on the diagonal processes. */
-    int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
-    int_t  nleaf = 0, nroot = 0;
-
-    /*-- Counts used for U-solve --*/
-    int_t  *bmod;         /* Modification count for U-solve. */
-    int_t  **bsendx_plist = Llu->bsendx_plist;
-    int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
-    int_t  *brecv;        /* Count of modifications to be recv'd from
-			     processes in this row. */
-    int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
-    double t;
+	doublecomplex *recvbuf,*recvbuf_on, *tempv, *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin;
+	doublecomplex *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */
+	doublecomplex *Linv; /* Inverse of diagonal block */
+	doublecomplex *Uinv; /* Inverse of diagonal block */
+	int *ipiv; 
+	int_t *leaf_send;
+	int_t nleaf_send, nleaf_send_tmp;
+	int_t *root_send;
+	int_t nroot_send, nroot_send_tmp;
+	
+	int_t  **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	BcTree  *LBtree_ptr = Llu->LBtree_ptr;
+	RdTree  *LRtree_ptr = Llu->LRtree_ptr;
+	BcTree  *UBtree_ptr = Llu->UBtree_ptr;
+	RdTree  *URtree_ptr = Llu->URtree_ptr;	
+	int_t  *Urbs1, *Urbs2; /* Number of row blocks in each block column of U. */
+	int_t  *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */
+	Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+	int_t  **Ucb_valptr = Llu->Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+	int_t  kcol, krow, mycol, myrow;
+	int_t  i, ii, il, j, jj, k, kk, lb, ljb, lk, lib, lptr, luptr, gb, nn;
+	int_t  nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i;
+	int_t  *xsup, *supno, *lsub, *usub;
+	int_t  *ilsum;    /* Starting position of each supernode in lsum (LOCAL)*/
+	int    Pc, Pr, iam;
+	int    knsupc, nsupr, nprobe;
+	int    nbtree, nrtree, outcount;
+	int    ldalsum;   /* Number of lsum entries locally owned. */
+	int    maxrecvsz, p, pi;
+	int_t  **Lrowind_bc_ptr;
+	doublecomplex **Lnzval_bc_ptr;
+	doublecomplex **Linv_bc_ptr;
+	doublecomplex **Uinv_bc_ptr;
+	doublecomplex sum;
+	MPI_Status status,status_on,statusx,statuslsum;
+	MPI_Request *send_req, recv_req, req;
+	pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm;
+	SuperLUStat_t **stat_loc;
+
+	double tmax;
+
+	/*-- Counts used for L-solve --*/
+	int_t  *fmod;         /* Modification count for L-solve --
+				 Count the number of local block products to
+				 be summed into lsum[lk]. */
+	int_t fmod_tmp;
+	int_t  **fsendx_plist = Llu->fsendx_plist;
+	int_t  nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
+	int_t  nfrecvx_buf=0;						 					 
+	int_t  *frecv;        /* Count of lsum[lk] contributions to be received
+				 from processes in this row. 
+				 It is only valid on the diagonal processes. */
+	int_t  frecv_tmp;
+	int_t  nfrecvmod = 0; /* Count of total modifications to be recv'd. */
+	int_t  nfrecv = 0; /* Count of total messages to be recv'd. */
+	int_t  nbrecv = 0; /* Count of total messages to be recv'd. */
+	int_t  nleaf = 0, nroot = 0;
+	int_t  nleaftmp = 0, nroottmp = 0;
+	int_t  msgsize;
+	/*-- Counts used for U-solve --*/
+	int_t  *bmod;         /* Modification count for U-solve. */
+	int_t bmod_tmp;
+	int_t  **bsendx_plist = Llu->bsendx_plist;
+	int_t  nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
+	int_t  nbrecvx_buf=0;		
+	int_t  *brecv;        /* Count of modifications to be recv'd from
+				 processes in this row. */
+	int_t  nbrecvmod = 0; /* Count of total modifications to be recv'd. */
+	int_t flagx,flaglsum,flag;
+	int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; 
+	int_t TAG;
+	double t1_sol, t2_sol, t;
 #if ( DEBUGlevel>=2 )
-    int_t Ublocks = 0;
+	int_t Ublocks = 0;
 #endif
 
-    int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
- 
-    t = SuperLU_timer_();
+	int_t gik,iklrow,fnz;
+	
+	int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */
+	int INFO, pad;
+	int_t tmpresult;
+
+	// #if ( PROFlevel>=1 )
+	double t1, t2;
+	float msg_vol = 0, msg_cnt = 0;
+	// #endif 
+
+	int_t *msgcnt=(int_t *) SUPERLU_MALLOC(4 * sizeof(int_t));   /* Count the size of the message xfer'd in each buffer:
+								      *     0 : transferred in Lsub_buf[]
+								      *     1 : transferred in Lval_buf[]
+								      *     2 : transferred in Usub_buf[]
+								      *     3 : transferred in Uval_buf[]
+								      */
+	int iword = sizeof (int_t);
+	int dword = sizeof (double);	
+	int Nwork;
+
+	yes_no_t done;
+	yes_no_t startforward;
+
+	int nbrow;
+	int_t  ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc;
+	int_t  lptr1_tmp, idx_i, idx_v,m; 
+
+	int_t thread_id,ready;
+	yes_no_t empty;
+	int_t sizelsum,sizertemp,aln_d,aln_i;
+
+	aln_d = ceil(CACHELINE/(double)dword);
+	aln_i = ceil(CACHELINE/(double)iword);
+	int num_thread = 1;
+#ifdef _OPENMP
+#pragma omp parallel default(shared)
+	{
+		if (omp_get_thread_num () == 0) {
+			num_thread = omp_get_num_threads ();
+		}
+	}
+#endif
+	if(grid->iam==0){
+		printf("num_thread: %5d\n",num_thread);
+		fflush(stdout);
+	}
+
+	MPI_Barrier( grid->comm );
+	TIC(t1_sol);
+	t = SuperLU_timer_();
 
     /* Test input parameters. */
     *info = 0;
@@ -549,8 +801,15 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     nsupers = supno[n-1] + 1;
     Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	Linv_bc_ptr = Llu->Linv_bc_ptr;
+	Uinv_bc_ptr = Llu->Uinv_bc_ptr;	
     nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
 
+	stat->utime[SOL_COMM] = 0.0;
+	stat->utime[SOL_GEMM] = 0.0;
+	stat->utime[SOL_TRSM] = 0.0;
+	stat->utime[SOL_L] = 0.0;	
+	
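Linv_bc_ptr and Uinv_bc_ptr point at precomputed inverses of the diagonal blocks (filled by pzCompute_Diag_Inv when options->DiagInv == YES), so the solve can apply inv(L(k,k)) as a dense matrix-vector product instead of a triangular solve on the critical path. Below is a minimal numeric sketch of why the two are equivalent; the 2x2 real data are made up and only illustrate the idea, they are not taken from the library.

    #include <stdio.h>

    /* Illustration only: for a 2x2 lower-triangular diagonal block L,
     * multiplying by a precomputed inverse (what Linv_bc_ptr stores per
     * supernode) gives the same x as forward substitution, but as a dense
     * matrix-vector product with no data-dependent recurrence. */
    int main(void)
    {
        double L[2][2]    = { {2.0, 0.0}, {1.0, 4.0} };
        double Linv[2][2] = { {0.5, 0.0}, {-0.125, 0.25} };  /* inv(L), precomputed */
        double b[2] = { 2.0, 6.0 }, x_sub[2], x_inv[2];

        /* forward substitution with L */
        x_sub[0] = b[0] / L[0][0];
        x_sub[1] = (b[1] - L[1][0] * x_sub[0]) / L[1][1];

        /* apply the precomputed inverse instead */
        for (int i = 0; i < 2; ++i)
            x_inv[i] = Linv[i][0] * b[0] + Linv[i][1] * b[1];

        printf("substitution: %g %g   inverse: %g %g\n",
               x_sub[0], x_sub[1], x_inv[0], x_inv[1]);
        return 0;
    }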
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pzgstrs()");
 #endif
@@ -563,13 +822,16 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     if ( !(fmod = intMalloc_dist(nlb)) )
 	ABORT("Calloc fails for fmod[].");
     for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
-    if ( !(frecv = intMalloc_dist(nlb)) )
+    if ( !(frecv = intCalloc_dist(nlb)) )
 	ABORT("Malloc fails for frecv[].");
     Llu->frecv = frecv;
 
-    k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
-    if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) )
-	ABORT("Malloc fails for send_req[].");
+	if ( !(leaf_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) )													  
+		ABORT("Malloc fails for leaf_send[].");
+	nleaf_send=0;
+	if ( !(root_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) )
+		ABORT("Malloc fails for root_send[].");
+	nroot_send=0;
 
 #ifdef _CRAY
     ftcs1 = _cptofcd("L", strlen("L"));
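
The leaf_send[] and root_send[] work lists allocated in the hunk above are filled later in this file from inside OpenMP tasks, each task reserving a slot with "#pragma omp atomic capture". A minimal standalone sketch of that idiom follows; the names, sizes and loop body are illustrative only, not the library's code:

    #include <stdio.h>

    int main(void) {
        int list[64];       /* shared work list (leaf_send plays this role) */
        int count = 0;      /* shared counter (nleaf_send plays this role) */
        int k;

    #pragma omp parallel for
        for (k = 0; k < 16; ++k) {
            int slot;
            /* Atomically bump the counter and capture the new value, so each
               iteration owns a distinct slot in the shared list. */
    #pragma omp atomic capture
            slot = ++count;
            list[slot - 1] = k;    /* record block k for later forwarding */
        }

        printf("queued %d blocks\n", count);
        return 0;
    }

The deferred sends recorded this way are flushed once the taskloop completes, which keeps MPI calls out of the innermost tasks.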
@@ -585,14 +847,54 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
     /* Allocate working storage. */
     knsupc = sp_ienv_dist(3);
     maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
-    if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) )
-	ABORT("Calloc fails for lsum[].");
+	sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H);
+	sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d;
+
+
+	
+#ifdef _OPENMP
+	if ( !(lsum = (doublecomplex*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(doublecomplex))))
+		ABORT("Malloc fails for lsum[].");	
+#pragma omp parallel default(shared) private(thread_id,ii)
+	{
+		thread_id = omp_get_thread_num ();
+		for(ii=0;ii=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t);
+	fflush(stdout);
+	t = SuperLU_timer_();
+#endif	
+
     /* Set up the headers in lsum[]. */
     ii = 0;
     for (k = 0; k < nsupers; ++k) {
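
In the hunk above, lsum[] becomes one private copy per OpenMP thread, and the per-thread length sizelsum is rounded up to a multiple of aln_d (one cache line's worth of doubles) so that two threads never touch the same cache line. A standalone sketch of that padding arithmetic, assuming a 64-byte cache line for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    #define CACHELINE 64   /* bytes; assumed value for illustration */

    int main(void) {
        int num_thread = 4;
        size_t nelem = 1000;                     /* entries needed per thread */
        int dword = sizeof(double);
        size_t aln_d = (size_t) ceil(CACHELINE / (double) dword);

        /* round the per-thread length up to a multiple of aln_d */
        size_t padded = ((nelem + aln_d - 1) / aln_d) * aln_d;

        double *lsum = malloc(padded * num_thread * sizeof(double));
        if (!lsum) return 1;

        /* thread t only writes lsum[t*padded .. t*padded+nelem-1] */
        for (int t = 0; t < num_thread; ++t)
            for (size_t i = 0; i < nelem; ++i)
                lsum[t * padded + i] = 0.0;

        printf("per-thread stride = %zu doubles\n", padded);
        free(lsum);
        return 0;
    }

The per-thread copies are summed back into the first copy before a supernode's accumulated contribution is used, trading memory for lock-free updates.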
@@ -616,735 +925,1216 @@ pzgstrs(int_t n, LUstruct_t *LUstruct,
 	ii += knsupc;
     }
 
-    /*
-     * Compute frecv[] and nfrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
-
-#if 1
-	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* local block number */
-		kcol = PCOL( k, grid );
-		if ( mycol != kcol && fmod[lk] )
-		    mod_bit[lk] = 1;  /* contribution from off-diagonal */
-	    }
+	/* ---------------------------------------------------------
+	   Initialize the async Bcast trees on all processes.
+	   --------------------------------------------------------- */		
+	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+
+	nbtree = 0;
+	for (lk=0;lk<nsupers_j;++lk){
+		if(LBtree_ptr[lk]!=NULL){
+			if(BcTree_IsRoot(LBtree_ptr[lk],'z')==NO){
+				nbtree++;
+				if(BcTree_getDestCount(LBtree_ptr[lk],'z')>0)nfrecvx_buf++;
+			}
+			BcTree_allocateRequest(LBtree_ptr[lk],'z');
+		}
 	}
-	/*PrintInt10("mod_bit", nlb, mod_bit);*/
-	
-#if ( PROFlevel>=2 )
-	t_reduce_tmp = SuperLU_timer_();
-#endif
-	/* Every process receives the count, but it is only useful on the
-	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
 
-#if ( PROFlevel>=2 )
-	t_reduce += SuperLU_timer_() - t_reduce_tmp;
-#endif
-
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* local block number */
-		kcol = PCOL( k, grid );
-		if ( mycol == kcol ) { /* diagonal process */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+	nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+	if ( !(	leafsups = (int_t*)intCalloc_dist(nsupers_i)) )
+		ABORT("Calloc fails for leafsups.");
+
+	nrtree = 0;
+	nleaf=0;
+	for (lk=0;lknprow;  /* not sure */
+			if(gbcomm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nfrecvmod += frecv[lk];
-		    if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
+	if ( !(recvbuf_BC_fwd = (doublecomplex*)SUPERLU_MALLOC(maxrecvsz*(nfrecvx+1) * sizeof(doublecomplex))) )  // this needs to be optimized for 1D row mapping
+		ABORT("Malloc fails for recvbuf_BC_fwd[].");	
+	nfrecvx_buf=0;			
+									
 #if ( DEBUGlevel>=2 )
-		    printf("(%2d) frecv[%4d]  %2d\n", iam, k, frecv[lk]);
-		    assert( frecv[lk] < Pc );
-#endif
-		}
-	    }
-	}
+	printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d,  nbtree %4d,  nrtree %4d\n",
+			iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree);
+	fflush(stdout);
 #endif
-    }
 
-    /* ---------------------------------------------------------
-       Solve the leaf nodes first by all the diagonal processes.
-       --------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nleaf %4d\n", iam, nleaf);
+
+
+#if ( PRNTlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t);
+	fflush(stdout);
+	MPI_Barrier( grid->comm );	
+	t = SuperLU_timer_();
 #endif
-    for (k = 0; k < nsupers && nleaf; ++k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    if ( frecv[lk]==0 && fmod[lk]==0 ) {
-		fmod[lk] = -1;  /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
+
+
+#if ( VAMPIR>=1 )
+	// VT_initialize(); 
+	VT_traceon();	
 #endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nleaf;
+
+
+	/* ---------------------------------------------------------
+	   Solve the leaf nodes first by all the diagonal processes.
+	   --------------------------------------------------------- */
 #if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
+	printf("(%2d) nleaf %4d\n", iam, nleaf);
+	fflush(stdout);
 #endif
-		
-		/*
-		 * Send Xk to process column Pc[k].
-		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( fsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
 
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#if 0
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				 SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared) 
 #endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
+	{	
+#ifdef _OPENMP
+#pragma omp master
 #endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		 */
-		nb = lsub[0] - 1;
-		lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		luptr = knsupc; /* Skip diagonal block L(k,k). */
-		
-		zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			   fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			   send_req, stat);
-	    }
-	} /* if diagonal process ... */
-    } /* for k ... */
+		{
 
-    /* -----------------------------------------------------------
-       Compute the internal nodes asynchronously by all processes.
-       ----------------------------------------------------------- */
-#if ( DEBUGlevel>=2 )
-    printf("(%2d) nfrecvx %4d,  nfrecvmod %4d,  nleaf %4d\n",
-	   iam, nfrecvx, nfrecvmod, nleaf);
+#ifdef _OPENMP
+#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nogroup	
 #endif
+			for (jj=0;jj<nleaf;jj++){
+				k=leafsups[jj];
+				{
+#if ( PROFlevel>=1 )
+					TIC(t1);
+#endif	 
+#ifdef _OPENMP
+					thread_id = omp_get_thread_num ();
+#else
+					thread_id = 0;
+#endif
+					rtemp_loc = &rtemp[sizertemp* thread_id];
 
-    while ( nfrecvx || nfrecvmod ) { /* While not finished. */
 
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
+					knsupc = SuperSize( k );
+					lk = LBi( k, grid );
 
-        k = (*recvbuf).r;
+					// if ( frecv[lk]==0 && fmod[lk]==0 ) { 
+					// fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+					ii = X_BLK( lk );
+					lk = LBj( k, grid ); /* Local block number, column-wise. */
+					lsub = Lrowind_bc_ptr[lk];
+					lusup = Lnzval_bc_ptr[lk];
 
-#if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
-#endif
-	
-	switch ( status.MPI_TAG ) {
-	  case Xk:
-	      --nfrecvx;
-	      lk = LBj( k, grid ); /* Local block number, column-wise. */
-	      lsub = Lrowind_bc_ptr[lk];
-	      lusup = Lnzval_bc_ptr[lk];
-	      if ( lsub ) {
-		  nb   = lsub[0];
-		  lptr = BC_HEADER;
-		  luptr = 0;
-		  knsupc = SuperSize( k );
-
-		  /*
-		   * Perform local block modifications: lsum[i] -= L_i,k * X[k]
-		   */
-		  zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu, 
-			     send_req, stat);
-	      } /* if lsub */
-
-	      break;
-
-	  case LSUM: /* Receiver must be a diagonal process */
-	      --nfrecvmod;
-	      lk = LBi( k, grid ); /* Local block number, row-wise. */
-	      ii = X_BLK( lk );
-	      knsupc = SuperSize( k );
-	      tempv = &recvbuf[LSUM_H];
-	      RHS_ITERATE(j) {
-		  for (i = 0; i < knsupc; ++i)
-		      z_add(&x[i + ii + j*knsupc],
-			    &x[i + ii + j*knsupc],
-			    &tempv[i + j*knsupc]);
-	      }
-
-	      if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
-		  fmod[lk] = -1; /* Do not solve X[k] in the future. */
-		  lk = LBj( k, grid ); /* Local block number, column-wise. */
-		  lsub = Lrowind_bc_ptr[lk];
-		  lusup = Lnzval_bc_ptr[lk];
-		  nsupr = lsub[1];
+					nsupr = lsub[1];
+
+
+
+					if(Llu->inv == 1){
+						Linv = Linv_bc_ptr[lk];
 #ifdef _CRAY
-		  CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
-			lusup, &nsupr, &x[ii], &knsupc);
+						CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+								&alpha, Linv, &knsupc, &x[ii],
+								&knsupc, &beta, rtemp_loc, &knsupc );
 #elif defined (USE_VENDOR_BLAS)
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
+						zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+								&alpha, Linv, &knsupc, &x[ii],
+								&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
 #else
-		  ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, 
-			 lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		  stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
-		      + 10 * knsupc * nrhs; /* complex division */
-#if ( DEBUGlevel>=2 )
-		  printf("(%2d) Solve X[%2d]\n", iam, k);
-#endif
-		
-		  /*
-		   * Send Xk to process column Pc[k].
-		   */
-		  kcol = PCOL( k, grid );
-		  for (p = 0; p < Pr; ++p) {
-		      if ( fsendx_plist[lk][p] != EMPTY ) {
-			  pi = PNUM( p, kcol, grid );
-
-			  MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
-                                     SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                     &send_req[Llu->SolveMsgSent++]);
-#if 0
-			  MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-				    SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm );
+						zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+								&alpha, Linv, &knsupc, &x[ii],
+								&knsupc, &beta, rtemp_loc, &knsupc );
+#endif		   
+						for (i=0 ; i=1 )
+					TOC(t2, t1);
+					stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+
+#endif	
+
+					stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+					+ 10 * knsupc * nrhs; /* complex division */
+			
+					
+					// --nleaf;
 #if ( DEBUGlevel>=2 )
-			  printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				 iam, x[ii-XK_H], pi);
+					printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		      }
-                  }
-		  /*
-		   * Perform local block modifications.
-		   */
-		  nb = lsub[0] - 1;
-		  lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
-		  luptr = knsupc; /* Skip diagonal block L(k,k). */
-
-		  zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
-			     fmod, nb, lptr, luptr, xsup, grid, Llu,
-			     send_req, stat);
-	      } /* if */
-
-	      break;
 
-#if ( DEBUGlevel>=2 )
-	    default:
-	      printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
-	      break;
+					/*
+					 * Send Xk to process column Pc[k].
+					 */
+
+					if(LBtree_ptr[lk]!=NULL){ 
+						lib = LBi( k, grid ); /* Local block number, row-wise. */
+						ii = X_BLK( lib );	
+
+#ifdef _OPENMP
+#pragma omp atomic capture
 #endif
-	  } /* switch */
+						nleaf_send_tmp = ++nleaf_send;
+						leaf_send[nleaf_send_tmp-1] = lk;
+						// BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
+					}
+				}		
+				}
+			}	
+		}
 
-    } /* while not finished ... */
 
+#if ( VTUNE>=1 )
+		__itt_resume();
+#endif
 
-#if ( PRNTlevel>=2 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
+		jj=0;
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id)
+		{
+			thread_id = omp_get_thread_num ();
+#else
+			{
+				thread_id = 0;
 #endif
 
-#if ( DEBUGlevel==2 )
-    {
-      printf("(%d) .. After L-solve: y =\n", iam);
-      for (i = 0, k = 0; k < nsupers; ++k) {
-	  krow = PROW( k, grid );
-	  kcol = PCOL( k, grid );
-	  if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
-	      knsupc = SuperSize( k );
-	      lk = LBi( k, grid );
-	      ii = X_BLK( lk );
-	      for (j = 0; j < knsupc; ++j)
-		printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
-	      fflush(stdout);
-	  }
-	  MPI_Barrier( grid->comm );
-      }
-    }
+#ifdef _OPENMP
+#pragma omp master
 #endif
+				{
 
-    SUPERLU_FREE(fmod);
-    SUPERLU_FREE(frecv);
-    SUPERLU_FREE(rtemp);
+#ifdef _OPENMP
+#pragma	omp	taskloop private (i,k,ii,knsupc,lk,nb,lptr,luptr,lsub,lusup,thread_id) untied num_tasks(num_thread*8) nogroup
+#endif
 
-    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+					for (jj=0;jj<nleaf;jj++){
+						k=leafsups[jj];
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
-    Llu->SolveMsgSent = 0;
+						// #ifdef _OPENMP
+						// #pragma	omp	task firstprivate (k) private (ii,knsupc,lk,nb,lptr,luptr,lsub,lusup,thread_id) untied	 	
+						// #endif
+						{
 
-    MPI_Barrier( grid->comm );
+#ifdef _OPENMP
+							thread_id = omp_get_thread_num ();
+#else
+							thread_id = 0;
+#endif								
+
+							/* Diagonal process */
+							knsupc = SuperSize( k );
+							lk = LBi( k, grid );
+
+							// if ( frecv[lk]==0 && fmod[lk]==0 ) { 
+							// fmod[lk] = -1;  /* Do not solve X[k] in the future. */
+							ii = X_BLK( lk );
+							lk = LBj( k, grid ); /* Local block number, column-wise. */
+							lsub = Lrowind_bc_ptr[lk];
+							lusup = Lnzval_bc_ptr[lk];
+
+							/*
+							 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+							 */
+							nb = lsub[0] - 1;
+							zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
+									fmod, nb, xsup, grid, Llu, 
+									stat_loc, leaf_send, &nleaf_send,sizelsum,sizertemp,0);	
+						}
+
+						// } /* if diagonal process ... */
+					} /* for k ... */
+				}
 
+			}
 
-    /*---------------------------------------------------
-     * Back solve Ux = y.
-     *
-     * The Y components from the forward solve is already
-     * on the diagonal processes.
-     *---------------------------------------------------*/
+			for (i=0;i<nleaf_send;i++){
+				lk = leaf_send[i];
+				if(lk>=0){ // this is a bcast forwarding
+					gb = mycol+lk*grid->npcol;  /* not sure */
+					lib = LBi( gb, grid ); /* Local block number, row-wise. */
+					ii = X_BLK( lib );			
+					BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
+				}else{ // this is a reduce forwarding
+					lk = -lk - 1;
+					il = LSUM_BLK( lk );
+					RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],'z');
+				}
+			}
 
-    /* Save the count to be altered so it can be used by
-       subsequent call to PZGSTRS. */
-    if ( !(bmod = intMalloc_dist(nlb)) )
-	ABORT("Calloc fails for bmod[].");
-    for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
-    if ( !(brecv = intMalloc_dist(nlb)) )
-	ABORT("Malloc fails for brecv[].");
-    Llu->brecv = brecv;
 
-    /*
-     * Compute brecv[] and nbrecvmod counts on the diagonal processes.
-     */
-    {
-	superlu_scope_t *scp = &grid->rscp;
 
-#if 1
-	for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* local block number */
-		kcol = PCOL( k, grid ); /* root process in this row scope */
-		if ( mycol != kcol && bmod[lk] )
-		    mod_bit[lk] = 1;  /* Contribution from off-diagonal */
-	    }
-	}
+#if ( VTUNE>=1 )
+			__itt_pause();
+#endif
 
-	/* Every process receives the count, but it is only useful on the
-	   diagonal processes.  */
-	MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm );
+			/* -----------------------------------------------------------
+			   Compute the internal nodes asynchronously by all processes.
+			   ----------------------------------------------------------- */
 
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* local block number */
-		kcol = PCOL( k, grid ); /* root process in this row scope. */
-		if ( mycol == kcol ) { /* diagonal process */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+#ifdef _OPENMP
+#pragma omp parallel default (shared) 
+#endif
+			{	
+#ifdef _OPENMP
+#pragma omp master 
+#endif
+				{									 
+					for ( nfrecv =0; nfrecv<nfrecvx+nfrecvmod;nfrecv++) { /* While not finished. */
+						thread_id = 0;
+#if ( PROFlevel>=1 )
+						TIC(t1);
+						// msgcnt[1] = maxrecvsz;
+#endif	
+
+						recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz];
+
+						/* Receive a message. */
+						MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+								MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+						// MPI_Irecv(recvbuf0,maxrecvsz,SuperLU_MPI_DOUBLE_COMPLEX,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req);
+						// ready=0;
+						// while(ready==0){
+						// MPI_Test(&req,&ready,&status);
+						// #pragma omp taskyield
+						// }
+
+#if ( PROFlevel>=1 )		 
+						TOC(t2, t1);
+						stat_loc[thread_id]->utime[SOL_COMM] += t2;
+
+						msg_cnt += 1;
+						msg_vol += maxrecvsz * dword;			
+#endif					  
+
+						{  
+							
+							k = (*recvbuf0).r;
+		
 #if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
+							printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
 #endif
-		}
-	    }
-	}
 
-#else /* old */
+							if(status.MPI_TAG==BC_L){
+								// --nfrecvx;
+								nfrecvx_buf++;
+								{
+									lk = LBj( k, grid );    /* local block number */
+
+									if(BcTree_getDestCount(LBtree_ptr[lk],'z')>0){
+
+										BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,'z');	
+										// nfrecvx_buf++;
+									}
+
+									/*
+									 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+									 */	  
+
+									lk = LBj( k, grid ); /* Local block number, column-wise. */
+									lsub = Lrowind_bc_ptr[lk];
+									lusup = Lnzval_bc_ptr[lk];
+									if ( lsub ) {
+										krow = PROW( k, grid );
+										if(myrow==krow){
+											nb = lsub[0] - 1;
+											knsupc = SuperSize( k );
+											ii = X_BLK( LBi( k, grid ) );
+											xin = &x[ii];
+										}else{
+											nb   = lsub[0];
+											knsupc = SuperSize( k );
+											xin = &recvbuf0[XK_H] ;					
+										}
+
+										zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
+												fmod, nb, xsup, grid, Llu,
+												stat_loc,sizelsum,sizertemp,0);	
+
+									} /* if lsub */
+								}
+
+							}else if(status.MPI_TAG==RD_L){
+								// --nfrecvmod;		  
+								lk = LBi( k, grid ); /* Local block number, row-wise. */
+
+								knsupc = SuperSize( k );
+								tempv = &recvbuf0[LSUM_H];
+								il = LSUM_BLK( lk );		  
+								RHS_ITERATE(j) {
+									for (i = 0; i < knsupc; ++i)
+										z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
+											  &lsum[i + il + j*knsupc + thread_id*sizelsum],
+											  &tempv[i + j*knsupc]);
+										
+								}			
+
+								// #ifdef _OPENMP
+								// #pragma omp atomic capture
+								// #endif
+								fmod_tmp=--fmod[lk];
+								{
+									thread_id = 0;
+									rtemp_loc = &rtemp[sizertemp* thread_id];
+									if ( fmod_tmp==0 ) {	  
+										if(RdTree_IsRoot(LRtree_ptr[lk],'z')==YES){
+											// ii = X_BLK( lk );
+											knsupc = SuperSize( k );
+											for (ii=1;ii=1 )
+											TIC(t1);
+#endif			  
+
+											if(Llu->inv == 1){
+												Linv = Linv_bc_ptr[lk];		  
+#ifdef _CRAY
+												CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+														&alpha, Linv, &knsupc, &x[ii],
+														&knsupc, &beta, rtemp_loc, &knsupc );
+#elif defined (USE_VENDOR_BLAS)
+												zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+														&alpha, Linv, &knsupc, &x[ii],
+														&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+#else
+												zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+														&alpha, Linv, &knsupc, &x[ii],
+														&knsupc, &beta, rtemp_loc, &knsupc );
+#endif			   
+												for (i=0 ; i=1 )
+											TOC(t2, t1);
+											stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+#endif	
 
-	for (k = 0; k < nsupers; ++k) {
-	    krow = PROW( k, grid );
-	    if ( myrow == krow ) {
-		lk = LBi( k, grid );    /* Local block number. */
-		kcol = PCOL( k, grid ); /* Root process in this row scope. */
-		if ( mycol != kcol && bmod[lk] )
-		    i = 1;  /* Contribution from non-diagonal process. */
-		else i = 0;
-		MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
-			   MPI_SUM, kcol, scp->comm );
-		if ( mycol == kcol ) { /* Diagonal process. */
-		    nbrecvmod += brecv[lk];
-		    if ( !brecv[lk] && !bmod[lk] ) ++nroot;
+											stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs
+											+ 10 * knsupc * nrhs; /* complex division */
 #if ( DEBUGlevel>=2 )
-		    printf("(%2d) brecv[%4d]  %2d\n", iam, k, brecv[lk]);
-		    assert( brecv[lk] < Pc );
+											printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
+
+											/*
+											 * Send Xk to process column Pc[k].
+											 */						  
+											if(LBtree_ptr[lk]!=NULL){ 
+												BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z');
+											}		  
+
+
+											/*
+											 * Perform local block modifications.
+											 */
+											lk = LBj( k, grid ); /* Local block number, column-wise. */
+											lsub = Lrowind_bc_ptr[lk];
+											lusup = Lnzval_bc_ptr[lk];
+											if ( lsub ) {
+												krow = PROW( k, grid );
+												nb = lsub[0] - 1;
+												knsupc = SuperSize( k );
+												ii = X_BLK( LBi( k, grid ) );
+												xin = &x[ii];		
+												zlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k,
+														fmod, nb, xsup, grid, Llu,
+														stat_loc,sizelsum,sizertemp,0);	
+											} /* if lsub */
+											// }
+
+									}else{
+
+										il = LSUM_BLK( lk );		  
+										knsupc = SuperSize( k );
+
+										for (ii=1;ii=1 )
+		t = SuperLU_timer_() - t;
+		stat->utime[SOL_L] = t;
+		if ( !iam ) {
+			printf(".. L-solve time\t%8.4f\n", t);
+			fflush(stdout);
+		}
+
+
+		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
+				MPI_MAX, 0, grid->comm);
+		if ( !iam ) {
+			printf(".. L-solve time (MAX) \t%8.4f\n", tmax);	
+			fflush(stdout);
+		}	
+
+
+		t = SuperLU_timer_();
 #endif
-    }
 
-    /* Re-initialize lsum to zero. Each block header is already in place. */
-    for (k = 0; k < nsupers; ++k) {
-	krow = PROW( k, grid );
-	if ( myrow == krow ) {
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid );
-	    il = LSUM_BLK( lk );
-	    dest = &lsum[il];
-	    RHS_ITERATE(j) {
-		for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero;
-	    }
-	}
-    }
 
-    /* Set up additional pointers for the index and value arrays of U.
-       nub is the number of local block columns. */
-    nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
-    if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
-	ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
-					     blocks in a block column. */
-    Urbs1 = Urbs + nub;
-    if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
-        ABORT("Malloc fails for Ucb_indptr[]");
-    if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
-        ABORT("Malloc fails for Ucb_valptr[]");
-
-    /* Count number of row blocks in a block column. 
-       One pass of the skeleton graph of U. */
-    for (lk = 0; lk < nlb; ++lk) {
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    /* usub[0] -- number of column blocks in this block row. */
-#if ( DEBUGlevel>=2 )
-	    Ublocks += usub[0];
+#if ( DEBUGlevel==2 )
+		{
+			printf("(%d) .. After L-solve: y =\n", iam);
+			for (i = 0, k = 0; k < nsupers; ++k) {
+				krow = PROW( k, grid );
+				kcol = PCOL( k, grid );
+				if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
+					knsupc = SuperSize( k );
+					lk = LBi( k, grid );
+					ii = X_BLK( lk );
+					for (j = 0; j < knsupc; ++j)
+						printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
+					fflush(stdout);
+				}
+				MPI_Barrier( grid->comm );
+			}
+		}
 #endif
-	    i = BR_HEADER; /* Pointer in index array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];            /* Global block number */
-		++Urbs[LBj(k,grid)];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
-	}
-    }
 
-    /* Set up the vertical linked lists for the row blocks.
-       One pass of the skeleton graph of U. */
-    for (lb = 0; lb < nub; ++lb) {
-	if ( Urbs[lb] ) { /* Not an empty block column. */
-	    if ( !(Ucb_indptr[lb]
-		   = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
-		ABORT("Malloc fails for Ucb_indptr[lb][]");
-	    if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
-		ABORT("Malloc fails for Ucb_valptr[lb][]");
-	}
-    }
-    for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
-	usub = Ufstnz_br_ptr[lk];
-	if ( usub ) { /* Not an empty block row. */
-	    i = BR_HEADER; /* Pointer in index array. */
-	    j = 0;         /* Pointer in nzval array. */
-	    for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
-		k = usub[i];          /* Global block number, column-wise. */
-		ljb = LBj( k, grid ); /* Local block number, column-wise. */
-		Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
-		Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
-		Ucb_valptr[ljb][Urbs1[ljb]] = j;
-		++Urbs1[ljb];
-		j += usub[i+1];
-		i += UB_DESCRIPTOR + SuperSize( k );
-	    }
+		SUPERLU_FREE(fmod);
+		SUPERLU_FREE(frecv);
+		SUPERLU_FREE(leaf_send);
+		SUPERLU_FREE(leafsups);
+		SUPERLU_FREE(recvbuf_BC_fwd);
+
+		for (lk=0;lkcomm );
+
+#if ( VAMPIR>=1 )	
+		VT_traceoff();	
+		VT_finalize(); 
+#endif
+
+
+		/*---------------------------------------------------
+		 * Back solve Ux = y.
+		 *
+		 * The Y components from the forward solve are already
+		 * on the diagonal processes.
+		 *---------------------------------------------------*/
+		 
+		 
+		/* Save the count to be altered so it can be used by
+		   subsequent call to PZGSTRS. */
+		if ( !(bmod = intMalloc_dist(nlb)) )
+			ABORT("Calloc fails for bmod[].");
+		for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
+		if ( !(brecv = intCalloc_dist(nlb)) )
+			ABORT("Malloc fails for brecv[].");
+		Llu->brecv = brecv;
+
+		k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb;
+
+		/* Re-initialize lsum to zero. Each block header is already in place. */
+		
+#ifdef _OPENMP
+
+	#pragma omp parallel default(shared) private(thread_id,k,krow,knsupc,lk,il,dest,j,i)
+	{
+		thread_id = omp_get_thread_num ();
+		for (k = 0; k < nsupers; ++k) {
+			krow = PROW( k, grid );
+			if ( myrow == krow ) {
+				knsupc = SuperSize( k );
+				lk = LBi( k, grid );
+				il = LSUM_BLK( lk );
+				dest = &lsum[il];
+					
+				RHS_ITERATE(j) {
+					for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + thread_id*sizelsum] = zero;
+				}	
+			}
+		}	
+	}	
+
+#else	
+	for (k = 0; k < nsupers; ++k) {
+		krow = PROW( k, grid );
+		if ( myrow == krow ) {
+			knsupc = SuperSize( k );
+			lk = LBi( k, grid );
+			il = LSUM_BLK( lk );
+			dest = &lsum[il];
+			
+			for (jj = 0; jj < num_thread; ++jj) {						
+				RHS_ITERATE(j) {
+					for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + jj*sizelsum] = zero;
+				}	
+			}	
+		}
 	}
-    }
+#endif		
 
 #if ( DEBUGlevel>=2 )
-    for (p = 0; p < Pr*Pc; ++p) {
-	if (iam == p) {
-	    printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("(%2d) Local col %2d: # row blocks %2d\n",
-		       iam, lb, Urbs[lb]);
-		if ( Urbs[lb] ) {
-		    for (i = 0; i < Urbs[lb]; ++i)
-			printf("(%2d) .. row blk %2d:\
-                               lbnum %d, indpos %d, valpos %d\n",
-			       iam, i, 
-			       Ucb_indptr[lb][i].lbnum,
-			       Ucb_indptr[lb][i].indpos,
-			       Ucb_valptr[lb][i]);
+		for (p = 0; p < Pr*Pc; ++p) {
+			if (iam == p) {
+				printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
+				for (lb = 0; lb < nub; ++lb) {
+					printf("(%2d) Local col %2d: # row blocks %2d\n",
+							iam, lb, Urbs[lb]);
+					if ( Urbs[lb] ) {
+						for (i = 0; i < Urbs[lb]; ++i)
+							printf("(%2d) .. row blk %2d:\
+									lbnum %d, indpos %d, valpos %d\n",
+									iam, i, 
+									Ucb_indptr[lb][i].lbnum,
+									Ucb_indptr[lb][i].indpos,
+									Ucb_valptr[lb][i]);
+					}
+				}
+			}
+			MPI_Barrier( grid->comm );
+		}
+		for (p = 0; p < Pr*Pc; ++p) {
+			if ( iam == p ) {
+				printf("\n(%d) bsendx_plist[][]", iam);
+				for (lb = 0; lb < nub; ++lb) {
+					printf("\n(%d) .. local col %2d: ", iam, lb);
+					for (i = 0; i < Pr; ++i)
+						printf("%4d", bsendx_plist[lb][i]);
+				}
+				printf("\n");
+			}
+			MPI_Barrier( grid->comm );
 		}
-	    }
-	}
-	MPI_Barrier( grid->comm );
-    }
-    for (p = 0; p < Pr*Pc; ++p) {
-	if ( iam == p ) {
-	    printf("\n(%d) bsendx_plist[][]", iam);
-	    for (lb = 0; lb < nub; ++lb) {
-		printf("\n(%d) .. local col %2d: ", iam, lb);
-		for (i = 0; i < Pr; ++i)
-		    printf("%4d", bsendx_plist[lb][i]);
-	    }
-	    printf("\n");
-	}
-	MPI_Barrier( grid->comm );
-    }
 #endif /* DEBUGlevel */
 
 
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
-    t = SuperLU_timer_();
-#endif
 
-    /*
-     * Solve the roots first by all the diagonal processes.
-     */
+
+	/* ---------------------------------------------------------
+	   Initialize the async Bcast trees on all processes.
+	   --------------------------------------------------------- */		
+	nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+
+	nbtree = 0;
+	for (lk=0;lk<nsupers_j;++lk){
+		if(UBtree_ptr[lk]!=NULL){
+			if(BcTree_IsRoot(UBtree_ptr[lk],'z')==NO){
+				nbtree++;
+				if(BcTree_getDestCount(UBtree_ptr[lk],'z')>0)nbrecvx_buf++;
+			}
+			BcTree_allocateRequest(UBtree_ptr[lk],'z');
+		}
+	}
+
+	nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+	if ( !(	rootsups = (int_t*)intCalloc_dist(nsupers_i)) )
+		ABORT("Calloc fails for rootsups.");
+
+	nrtree = 0;
+	nroot=0;
+	for (lk=0;lknprow;  /* not sure */
+			if(gb=2 )
-    printf("(%2d) nroot %4d\n", iam, nroot);
+	printf("(%2d) nbrecvx %4d,  nbrecvmod %4d,  nroot %4d,  nbtree %4d,  nrtree %4d\n",
+			iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree);
+	fflush(stdout);
 #endif
-    for (k = nsupers-1; k >= 0 && nroot; --k) {
-	krow = PROW( k, grid );
-	kcol = PCOL( k, grid );
-	if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
-	    knsupc = SuperSize( k );
-	    lk = LBi( k, grid ); /* Local block number, row-wise. */
-	    if ( brecv[lk]==0 && bmod[lk]==0 ) {
-		bmod[lk] = -1;       /* Do not solve X[k] in the future. */
-		ii = X_BLK( lk );
-		lk = LBj( k, grid ); /* Local block number, column-wise */
-		lsub = Lrowind_bc_ptr[lk];
-		lusup = Lnzval_bc_ptr[lk];
-		nsupr = lsub[1];
-#ifdef _CRAY
-		CTRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
-		      lusup, &nsupr, &x[ii], &knsupc);
-#elif defined (USE_VENDOR_BLAS)
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1);
-#else
-		ztrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, 
-		       lusup, &nsupr, &x[ii], &knsupc);
-#endif
-		stat->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-		    + 10 * knsupc * nrhs; /* complex division */
-		--nroot;
-#if ( DEBUGlevel>=2 )
-		printf("(%2d) Solve X[%2d]\n", iam, k);
+
+
+#if ( PRNTlevel>=1 )
+	t = SuperLU_timer_() - t;
+	if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t);
+	fflush(stdout);
+	MPI_Barrier( grid->comm );	
+	t = SuperLU_timer_();
 #endif
+
 		/*
-		 * Send Xk to process column Pc[k].
+		 * Solve the roots first by all the diagonal processes.
 		 */
-		for (p = 0; p < Pr; ++p) {
-		    if ( bsendx_plist[lk][p] != EMPTY ) {
-			pi = PNUM( p, kcol, grid );
+#if ( DEBUGlevel>=2 )
+		printf("(%2d) nroot %4d\n", iam, nroot);
+		fflush(stdout);				
+#endif
+		
+		
 
-			MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                   SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                   &send_req[Llu->SolveMsgSent++]);
-#if 0
-			MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                  SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
-                                  grid->comm );
+#ifdef _OPENMP
+#pragma omp parallel default (shared) 
 #endif
-#if ( DEBUGlevel>=2 )
-			printf("(%2d) Sent X[%2.0f] to P %2d\n",
-			       iam, x[ii-XK_H], pi);
+	{	
+#ifdef _OPENMP
+#pragma omp master
 #endif
-		    }
-		}
-		/*
-		 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
-		 */
-		if ( Urbs[lk] ) 
-		    zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-			       Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-			       send_req, stat);
-	    } /* if root ... */
-	} /* if diagonal process ... */
-    } /* for k ... */
+		{
+#ifdef _OPENMP
+#pragma	omp	taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup	
+#endif		
+		for (jj=0;jj<nroot;jj++){
+			k=rootsups[jj];
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif	
+#ifdef _OPENMP
+			thread_id = omp_get_thread_num ();
+#else
+			thread_id = 0;
+#endif
+			rtemp_loc = &rtemp[sizertemp* thread_id];
 
 
-    /*
-     * Compute the internal nodes asychronously by all processes.
-     */
-    while ( nbrecvx || nbrecvmod ) { /* While not finished. */
+			
+			knsupc = SuperSize( k );
+			lk = LBi( k, grid ); /* Local block number, row-wise. */		
+
+			// bmod[lk] = -1;       /* Do not solve X[k] in the future. */
+			ii = X_BLK( lk );
+			lk = LBj( k, grid ); /* Local block number, column-wise */
+			lsub = Lrowind_bc_ptr[lk];
+			lusup = Lnzval_bc_ptr[lk];
+			nsupr = lsub[1];
+
 
-	/* Receive a message. */
-	MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
-                  MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );
-        k = (*recvbuf).r;
+			if(Llu->inv == 1){
+
+				Uinv = Uinv_bc_ptr[lk];
+#ifdef _CRAY
+				CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc );
+#elif defined (USE_VENDOR_BLAS)
+				zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+#else
+				zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+						&alpha, Uinv, &knsupc, &x[ii],
+						&knsupc, &beta, rtemp_loc, &knsupc );
+#endif			   
+
+				for (i=0 ; i=1 )
+			TOC(t2, t1);
+			stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+#endif	
+			stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+			+ 10 * knsupc * nrhs; /* complex division */
 
 #if ( DEBUGlevel>=2 )
-	printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+			printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
 
-	switch ( status.MPI_TAG ) {
-	    case Xk:
-	        --nbrecvx;
-		lk = LBj( k, grid ); /* Local block number, column-wise. */
+			/*
+			 * Send Xk to process column Pc[k].
+			 */
+
+			if(UBtree_ptr[lk]!=NULL){ 
+#ifdef _OPENMP
+#pragma omp atomic capture
+#endif
+				nroot_send_tmp = ++nroot_send;
+				root_send[nroot_send_tmp-1] = lk;
+				
+				// lib = LBi( k, grid ); /* Local block number, row-wise. */
+				// ii = X_BLK( lib );				
+				// BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],'z');
+			}
+
+			/*
+			 * Perform local block modifications: lsum[i] -= U_i,k * X[k]
+			 */
+			if ( Urbs[lk] ) 
+				zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2, 
+						Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+						send_req, stat_loc, root_send, &nroot_send, sizelsum,sizertemp);
+									
+		} /* for k ... */
+	}
+}
+
+
+for (i=0;i<nroot_send;i++){
+	lk = root_send[i];
+	if(lk>=0){ // this is a bcast forwarding
+		gb = mycol+lk*grid->npcol;  /* not sure */
+		lib = LBi( gb, grid ); /* Local block number, row-wise. */
+		ii = X_BLK( lib );			
+		BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],'z');
+	}else{ // this is a reduce forwarding
+		lk = -lk - 1;
+		il = LSUM_BLK( lk );
+		RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],'z');
+	}
+}
+
+
 		/*
-		 * Perform local block modifications:
-		 *         lsum[i] -= U_i,k * X[k]
+		 * Compute the internal nodes asynchronously by all processes.
 		 */
-		zlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
-			   Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
-			   send_req, stat);
 
-	        break;
+#ifdef _OPENMP
+#pragma omp parallel default (shared) 
+#endif
+	{	
+#ifdef _OPENMP
+#pragma omp master 
+#endif		 
+		for ( nbrecv =0; nbrecv<nbrecvx+nbrecvmod;nbrecv++) { /* While not finished. */
+			thread_id = 0;
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif	
+
+			recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz];
+
+			/* Receive a message. */
+			MPI_Recv( recvbuf0, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX,
+					MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status );	 	
+
+#if ( PROFlevel>=1 )		 
+			TOC(t2, t1);
+			stat_loc[thread_id]->utime[SOL_COMM] += t2;
+
+			msg_cnt += 1;
+			msg_vol += maxrecvsz * dword;			
+#endif	
+		 
+			k = (*recvbuf0).r;
+#if ( DEBUGlevel>=2 )
+			printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
+			fflush(stdout);
+#endif
+
+			if(status.MPI_TAG==BC_U){
+				// --nfrecvx;
+				nbrecvx_buf++;
+				
+				lk = LBj( k, grid );    /* local block number */
+
+				if(BcTree_getDestCount(UBtree_ptr[lk],'z')>0){
+
+					BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,'z');	
+					// nfrecvx_buf++;
+				}
+
+				/*
+				 * Perform local block modifications: lsum[i] -= L_i,k * X[k]
+				 */	  
+
+				lk = LBj( k, grid ); /* Local block number, column-wise. */
+				zlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,Urbs2,
+						Ucb_indptr, Ucb_valptr, xsup, grid, Llu, 
+						send_req, stat_loc, sizelsum,sizertemp);
+			}else if(status.MPI_TAG==RD_U){
+
+				lk = LBi( k, grid ); /* Local block number, row-wise. */
+				
+				knsupc = SuperSize( k );
+				tempv = &recvbuf0[LSUM_H];
+				il = LSUM_BLK( lk );		  
+				RHS_ITERATE(j) {
+					for (i = 0; i < knsupc; ++i)
+						z_add(&lsum[i + il + j*knsupc + thread_id*sizelsum],
+							  &lsum[i + il + j*knsupc + thread_id*sizelsum],
+							  &tempv[i + j*knsupc]);
+							
+				}					
+			// #ifdef _OPENMP
+			// #pragma omp atomic capture
+			// #endif
+				bmod_tmp=--bmod[lk];
+				thread_id = 0;									
+				rtemp_loc = &rtemp[sizertemp* thread_id];
+				if ( bmod_tmp==0 ) {
+					if(RdTree_IsRoot(URtree_ptr[lk],'z')==YES){							
+						
+						knsupc = SuperSize( k );
+						for (ii=1;iiinv == 1){
+
+							Uinv = Uinv_bc_ptr[lk];
 
-	    case LSUM: /* Receiver must be a diagonal process */
-		--nbrecvmod;
-		lk = LBi( k, grid ); /* Local block number, row-wise. */
-		ii = X_BLK( lk );
-		knsupc = SuperSize( k );
-		tempv = &recvbuf[LSUM_H];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i)
-                        z_add(&x[i + ii + j*knsupc],
-			      &x[i + ii + j*knsupc],
-			      &tempv[i + j*knsupc]);
-		}
+#ifdef _CRAY
+							CGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc,
+									&alpha, Uinv, &knsupc, &x[ii],
+									&knsupc, &beta, rtemp_loc, &knsupc );
+#elif defined (USE_VENDOR_BLAS)
+							zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+									&alpha, Uinv, &knsupc, &x[ii],
+									&knsupc, &beta, rtemp_loc, &knsupc, 1, 1 );
+#else
+							zgemm_( "N", "N", &knsupc, &nrhs, &knsupc,
+									&alpha, Uinv, &knsupc, &x[ii],
+									&knsupc, &beta, rtemp_loc, &knsupc );
+#endif		
 
-		if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
-		    bmod[lk] = -1; /* Do not solve X[k] in the future. */
-		    lk = LBj( k, grid ); /* Local block number, column-wise. */
-		    lsub = Lrowind_bc_ptr[lk];
-		    lusup = Lnzval_bc_ptr[lk];
-		    nsupr = lsub[1];
+							for (i=0 ; iops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
-			+ 10 * knsupc * nrhs; /* complex division */
+						}
+
+#if ( PROFlevel>=1 )
+							TOC(t2, t1);
+							stat_loc[thread_id]->utime[SOL_TRSM] += t2;
+#endif	
+							stat_loc[thread_id]->ops[SOLVE] += 4 * knsupc * (knsupc + 1) * nrhs
+							+ 10 * knsupc * nrhs; /* complex division */
+		
 #if ( DEBUGlevel>=2 )
-		    printf("(%2d) Solve X[%2d]\n", iam, k);
+						printf("(%2d) Solve X[%2d]\n", iam, k);
 #endif
-		    /*
-		     * Send Xk to process column Pc[k].
-		     */
-		    kcol = PCOL( k, grid );
-		    for (p = 0; p < Pr; ++p) {
-			if ( bsendx_plist[lk][p] != EMPTY ) {
-			    pi = PNUM( p, kcol, grid );
-
-			    MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                       SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm,
-                                       &send_req[Llu->SolveMsgSent++] );
-#if 0
-			    MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
-                                      SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk,
-                                      grid->comm );
+
+						/*
+						 * Send Xk to process column Pc[k].
+						 */						
+						if(UBtree_ptr[lk]!=NULL){ 
+							BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],'z');
+						}							
+						
+
+						/*
+						 * Perform local block modifications: 
+						 *         lsum[i] -= U_i,k * X[k]
+						 */
+						if ( Urbs[lk] )
+							zlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2,
+									Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
+									send_req, stat_loc, sizelsum,sizertemp);
+
+					}else{
+						il = LSUM_BLK( lk );		  
+						knsupc = SuperSize( k );
+
+						for (ii=1;ii=1 )
+		t = SuperLU_timer_() - t;
+		if ( !iam ) printf(".. U-solve time\t%8.4f\n", t);
+		MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE,
+				MPI_MAX, 0, grid->comm);
+		if ( !iam ) {
+			printf(".. U-solve time (MAX) \t%8.4f\n", tmax);	
+			fflush(stdout);
+		}			
+		t = SuperLU_timer_();			
 #endif
+
+
+
+
 #if ( DEBUGlevel>=2 )
-			    printf("(%2d) Sent X[%2.0f] to P %2d\n",
-				   iam, x[ii - XK_H], pi);
+		{
+			doublecomplex *x_col;
+			int diag;
+			printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
+			ii = 0;
+			for (k = 0; k < nsupers; ++k) {
+				knsupc = SuperSize( k );
+				krow = PROW( k, grid );
+				kcol = PCOL( k, grid );
+				diag = PNUM( krow, kcol, grid);
+				if ( iam == diag ) { /* Diagonal process. */
+					lk = LBi( k, grid );
+					jj = X_BLK( lk );
+					x_col = &x[jj];
+					RHS_ITERATE(j) {
+						for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
+							printf("\t(%d)\t%4d\t%.10f\n",
+									iam, xsup[k]+i, x_col[i]);
+						}
+						x_col += knsupc;
+					}
+				}
+				ii += knsupc;
+			} /* for k ... */
+		}
 #endif
-			}
-		    }
-		    /*
-		     * Perform local block modifications: 
-		     *         lsum[i] -= U_i,k * X[k]
-		     */
-		    if ( Urbs[lk] )
-			zlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
-				   Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
-				   send_req, stat);
-		} /* if becomes solvable */
-		
-		break;
 
-#if ( DEBUGlevel>=2 )
-	      default:
-		printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG);
-		break;
-#endif		
+		pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
+				ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
-	} /* switch */
 
-    } /* while not finished ... */
+#if ( PRNTlevel>=1 )
+		t = SuperLU_timer_() - t;
+		if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t);
+		t = SuperLU_timer_();
+#endif	
 
-#if ( PRNTlevel>=3 )
-    t = SuperLU_timer_() - t;
-    if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
-#endif
 
-#if ( DEBUGlevel>=2 )
-    {
-	doublecomplex *x_col;
-	int diag;
-	printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam);
-	ii = 0;
-	for (k = 0; k < nsupers; ++k) {
-	    knsupc = SuperSize( k );
-	    krow = PROW( k, grid );
-	    kcol = PCOL( k, grid );
-	    diag = PNUM( krow, kcol, grid);
-	    if ( iam == diag ) { /* Diagonal process. */
-		lk = LBi( k, grid );
-		jj = X_BLK( lk );
-		x_col = &x[jj];
-		RHS_ITERATE(j) {
-		    for (i = 0; i < knsupc; ++i) { /* X stored in blocks */
-			printf("\t(%d)\t%4d\t%.10f\n",
-			       iam, xsup[k]+i, x_col[i]);
-		    }
-		    x_col += knsupc;
+		double tmp1=0; 
+		double tmp2=0;
+		double tmp3=0;
+		double tmp4=0;
+		for(i=0;i<num_thread;i++){
+			tmp1 = MAX(tmp1,stat_loc[i]->utime[SOL_TRSM]);
+			tmp2 = MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]);
+			tmp3 = MAX(tmp3,stat_loc[i]->utime[SOL_COMM]);
+			tmp4 += stat_loc[i]->ops[SOLVE];
+#if ( PRNTlevel>=2 )
+			if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]);
+#endif	
 		}
-	    }
-	    ii += knsupc;
-	} /* for k ... */
-    }
-#endif
 
-    pzReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum,
-			  ScalePermstruct, Glu_persist, grid, SOLVEstruct);
 
+		stat->utime[SOL_TRSM] += tmp1;
+		stat->utime[SOL_GEMM] += tmp2;
+		stat->utime[SOL_COMM] += tmp3;
+		stat->ops[SOLVE]+= tmp4;	  
 
-    /* Deallocate storage. */
-    SUPERLU_FREE(lsum);
-    SUPERLU_FREE(x);
-    SUPERLU_FREE(recvbuf);
-    for (i = 0; i < nub; ++i) {
-	if ( Urbs[i] ) {
-	    SUPERLU_FREE(Ucb_indptr[i]);
-	    SUPERLU_FREE(Ucb_valptr[i]);
-	}
-    }
-    SUPERLU_FREE(Ucb_indptr);
-    SUPERLU_FREE(Ucb_valptr);
-    SUPERLU_FREE(Urbs);
-    SUPERLU_FREE(bmod);
-    SUPERLU_FREE(brecv);
 
-    /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
+		/* Deallocate storage. */
+		SUPERLU_FREE(stat_loc);
+		SUPERLU_FREE(rtemp);
+		SUPERLU_FREE(lsum);
+		SUPERLU_FREE(x);
+		
+		
+		SUPERLU_FREE(bmod);
+		SUPERLU_FREE(brecv);
+		SUPERLU_FREE(root_send);
+		
+		SUPERLU_FREE(rootsups);
+		SUPERLU_FREE(recvbuf_BC_fwd);		
+		
+		for (lk=0;lkcomm );
+
+		/*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/
 
-    for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status);
-    SUPERLU_FREE(send_req);
 
-    MPI_Barrier( grid->comm );
+#if ( PROFlevel>=2 )
+		{
+			float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
+
+			MPI_Reduce (&msg_cnt, &msg_cnt_sum,
+					1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+			MPI_Reduce (&msg_cnt, &msg_cnt_max,
+					1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+			MPI_Reduce (&msg_vol, &msg_vol_sum,
+					1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
+			MPI_Reduce (&msg_vol, &msg_vol_max,
+					1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+			if (!iam) {
+				printf ("\tPZGSTRS comm stat:"
+						"\tAvg\tMax\t\tAvg\tMax\n"
+						"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
+						msg_cnt_sum / Pr / Pc, msg_cnt_max,
+						msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6);
+			}
+		}
+#endif	
 
-    stat->utime[SOLVE] = SuperLU_timer_() - t;
+		TOC(t2_sol,t1_sol);
+		stat->utime[SOLVE] = t2_sol;
 
 #if ( DEBUGlevel>=1 )
-    CHECK_MALLOC(iam, "Exit pzgstrs()");
+		CHECK_MALLOC(iam, "Exit pzgstrs()");
 #endif
 
-    return;
-} /* PZGSTRS */
+		return;
+	} /* PZGSTRS */
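
A recurring pattern in the rewritten solve above is the Llu->inv == 1 branch: instead of a triangular solve with the diagonal block (ztrsm_), the code multiplies by a precomputed inverse of that block (Linv_bc_ptr / Uinv_bc_ptr) via zgemm_, which exposes more parallelism per supernode. The sketch below shows the idea on plain double data, with naive loops standing in for the BLAS call; the sizes and the 2x2 example are illustrative only:

    #include <stdio.h>

    /* x(knsupc x nrhs) := Dinv(knsupc x knsupc) * x, column-major.
       rtemp is scratch of the same size as x.  The real code calls zgemm_
       on doublecomplex data instead of these loops, then copies back. */
    static void apply_diag_inverse(int knsupc, int nrhs,
                                   const double *Dinv, double *x, double *rtemp)
    {
        for (int j = 0; j < nrhs; ++j)
            for (int i = 0; i < knsupc; ++i) {
                double s = 0.0;
                for (int k = 0; k < knsupc; ++k)
                    s += Dinv[i + k * knsupc] * x[k + j * knsupc];
                rtemp[i + j * knsupc] = s;
            }
        for (int i = 0; i < knsupc * nrhs; ++i)
            x[i] = rtemp[i];   /* copy the result back, as the solver does */
    }

    int main(void) {
        /* Dinv is the inverse of [[2,0],[1,1]], i.e. [[0.5,0],[-0.5,1]] */
        double Dinv[4] = {0.5, -0.5, 0.0, 1.0};  /* column-major */
        double x[2] = {4.0, 5.0}, rtemp[2];
        apply_diag_inverse(2, 1, Dinv, x, rtemp);
        printf("x = [%g, %g]\n", x[0], x[1]);    /* expect [2, 3] */
        return 0;
    }

The extra flops of a full multiply are usually paid back because the GEMM vectorizes and threads better than a small TRSM, especially for several right-hand sides.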
 
diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c
index 23bd35e1..d4e9e2fc 100644
--- a/SRC/pzgstrs_lsum.c
+++ b/SRC/pzgstrs_lsum.c
@@ -24,6 +24,7 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
+#include "superlu_defs.h"
 
 #define ISEND_IRECV
 
@@ -74,7 +75,7 @@ void zlsum_fmod
     doublecomplex *lusup, *lusup1;
     doublecomplex *dest;
     int    iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi;
-    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, rel;
+    int_t  i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel;
     int_t  *lsub, *lsub1, nlb1, lptr1, luptr1;
     int_t  *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum.   */
     int_t  *frecv = Llu->frecv;
@@ -82,6 +83,14 @@ void zlsum_fmod
     MPI_Status status;
     int test_flag;
 
+#if ( PROFlevel>=1 )
+	double t1, t2;
+	float msg_vol = 0, msg_cnt = 0;
+#endif 
+#if ( PROFlevel>=1 )
+	TIC(t1);
+#endif	
+	
     iam = grid->iam;
     myrow = MYROW( iam, grid );
     lk = LBj( k, grid ); /* Local block number, column-wise. */
@@ -121,7 +130,12 @@ void zlsum_fmod
 		      &rtemp[i + j*nbrow]);
 	}
 	luptr += nbrow;
-		    
+
+#if ( PROFlevel>=1 )
+		TOC(t2, t1);
+		stat->utime[SOL_GEMM] += t2;
+#endif		
+	
 	if ( (--fmod[lk])==0 ) { /* Local accumulation done. */
 	    ikcol = PCOL( ik, grid );
 	    p = PNUM( myrow, ikcol, grid );
@@ -156,6 +170,9 @@ void zlsum_fmod
 		    lsub1 = Llu->Lrowind_bc_ptr[lk];
 		    lusup1 = Llu->Lnzval_bc_ptr[lk];
 		    nsupr1 = lsub1[1];
+#if ( PROFlevel>=1 )
+			TIC(t1);
+#endif				
 #ifdef _CRAY
 		    CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha,
 			  lusup1, &nsupr1, &x[ii], &iknsupc);
@@ -166,6 +183,11 @@ void zlsum_fmod
 		    ztrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, 
 			   lusup1, &nsupr1, &x[ii], &iknsupc);
 #endif
+#if ( PROFlevel>=1 )
+			TOC(t2, t1);
+			stat->utime[SOL_TRSM] += t2;
+#endif	
+
 		    stat->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs
 			+ 10 * knsupc * nrhs; /* complex division */
 #if ( DEBUGlevel>=2 )
@@ -241,7 +263,7 @@ void zlsum_bmod
  * =======
  *   Perform local block modifications: lsum[i] -= U_i,k * X[k].
  */
-    doublecomplex alpha = {1.0, 0.0};
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
     int    iam, iknsupc, knsupc, myrow, nsupr, p, pi;
     int_t  fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow,
            j, jj, lk, lk1, nub, ub, uptr;
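
The new zlsum_fmod_inv routines added in the hunk below split the nlb destination blocks of a supernode into num_thread chunks (the Nchunk / nlb_loc / remainder variables) and hand one chunk to each taskloop iteration, with the first `remainder` chunks taking one extra block. A standalone sketch of that partition arithmetic; the exact bounds in the library may differ slightly:

    #include <stdio.h>

    int main(void) {
        int nlb = 10, num_thread = 4;
        int nlb_loc = nlb / num_thread;    /* blocks per chunk, rounded down */
        int remainder = nlb % num_thread;  /* chunks that take one extra block */

        for (int nn = 0; nn < num_thread; ++nn) {
            int lbstart, lbend;            /* chunk nn handles [lbstart, lbend) */
            if (nn < remainder) {
                lbstart = nn * (nlb_loc + 1);
                lbend   = lbstart + nlb_loc + 1;
            } else {
                lbstart = remainder + nn * nlb_loc;
                lbend   = lbstart + nlb_loc;
            }
            printf("chunk %d: blocks [%d, %d)\n", nn, lbstart, lbend);
        }
        return 0;
    }

Chunking by block rather than by row keeps each GEMM call large enough to be efficient while still giving every thread a nearly equal share of the update.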
@@ -383,3 +405,1771 @@ void zlsum_bmod
 
 } /* zlSUM_BMOD */
 
+
+
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
+ */ +void zlsum_fmod_inv +/************************************************************************/ +( + doublecomplex *lsum, /* Sum of local modifications. */ + doublecomplex *x, /* X array (local) */ + doublecomplex *xk, /* X[k]. */ + doublecomplex *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + SuperLUStat_t **stat, + int_t *leaf_send, + int_t *nleaf_send, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel +) +{ + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0},malpha={-1.0, 0.0}; + doublecomplex *lusup, *lusup1; + doublecomplex *dest; + doublecomplex *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id,thread_id1,num_thread; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + doublecomplex *rtemp_loc; + int_t ldalsum,maxsuper,aln_d; + int dword = sizeof (double); + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + + maxsuper = sp_ienv_dist(3); +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lsub = Llu->Lrowind_bc_ptr[lk]; + + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>8*maxsuper){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else + zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id1], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id1], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + for (lb=lbstart;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; + + +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); + +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[nleaf_send_tmp-1] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + } + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + + nlb1 = lsub1[0] - 1; + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + + } + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + zgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } + nbrow_ref+=nbrow1; + } + + // TOC(t3, t1); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + thread_id1 = omp_get_thread_num (); + rtemp_loc = &rtemp[sizertemp* thread_id1]; + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[nleaf_send_tmp-1] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + } + + /* + * Perform local block modifications. 
+ */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + + { + nlb1 = lsub1[0] - 1; + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } +} + + stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; + + + +} /* if nlb>0*/ +} /* zLSUM_FMOD_INV */ + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
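+ *   Sketch of the update: the off-diagonal blocks of column k are
+ *   multiplied with X[k] in large GEMM calls (one per chunk of blocks)
+ *   and the result is scattered, i.e. subtracted, into the lsum segments
+ *   of the corresponding block rows.  In this "master" variant the
+ *   GEMM/scatter work may be split over threads with an OpenMP taskloop,
+ *   while the follow-up diagonal-block solves and broadcasts are carried
+ *   out by the calling thread.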
+ */ +void zlsum_fmod_inv_master +/************************************************************************/ +( + doublecomplex *lsum, /* Sum of local modifications. */ + doublecomplex *x, /* X array (local) */ + doublecomplex *xk, /* X[k]. */ + doublecomplex *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel +) +{ + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0},malpha={-1.0, 0.0}; + doublecomplex *lusup, *lusup1; + doublecomplex *dest; + doublecomplex *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id,thread_id1,num_thread; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + doublecomplex *rtemp_loc; + int_t ldalsum,maxsuper,aln_d; + int dword = sizeof (double); + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + + maxsuper = sp_ienv_dist(3); +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lsub = Llu->Lrowind_bc_ptr[lk]; + + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>4*maxsuper || nrhs>10){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + CGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else + zgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + z_sub(&lsum[il+irow + j*iknsupc], + &lsum[il+irow + j*iknsupc], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + zgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. 
*/ + + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } + nbrow_ref+=nbrow1; + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } + // TOC(t3, t1); + thread_id1 = omp_get_thread_num (); + + + + + rtemp_loc = &rtemp[sizertemp* thread_id1]; + + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL) + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + nlb1 = lsub1[0] - 1; + + + zlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; + } /* if nlb>0*/ +} /* zLSUM_FMOD_INV */ + + + +/************************************************************************/ +void zlsum_bmod_inv +/************************************************************************/ +( + doublecomplex *lsum, /* Sum of local modifications. */ + doublecomplex *x, /* X array (local). */ + doublecomplex *xk, /* X[k]. */ + doublecomplex *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + int_t *Urbs2, + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t **stat, + int_t* root_send, + int_t* nroot_send, + int_t sizelsum, + int_t sizertemp + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + doublecomplex *uval, *dest, *y; + int_t *lsub; + doublecomplex *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. 
*/ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id,thread_id1,num_thread; + doublecomplex *rtemp_loc; + int_t nroot_send_tmp; + doublecomplex *Uinv;/* Inverse of diagonal block */ + doublecomplex temp; + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + if(nub>num_thread){ + // // // // if(Urbs2[lk]>num_thread){ + // if(Urbs2[lk]>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#endif + for (nn=0;nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + { + zz_mult(&temp, &uval[uptr], &y[jj]); + z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow], + &temp); + ++uptr; + } + stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); + + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. 
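+ * (the per-thread copies of lsum are first folded into one, the result is
+ * added into x, and the diagonal block is then applied through its
+ * precomputed inverse Uinv when Llu->inv == 1)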
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; iUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + + { + zz_mult(&temp, &uval[uptr], &y[jj]); + z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow], + &temp); + ++uptr; + } + stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + // for (i=0 ; inum_thread){ + #ifdef _OPENMP + #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied + #endif + zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat, root_send, nroot_send, sizelsum,sizertemp); + }else{ + zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat, root_send, nroot_send, sizelsum,sizertemp); + } + + // } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ + } + +} /* zlSUM_BMOD_inv */ + + + +/************************************************************************/ +void zlsum_bmod_inv_master +/************************************************************************/ +( + doublecomplex *lsum, /* Sum of local modifications. */ + doublecomplex *x, /* X array (local). */ + doublecomplex *xk, /* X[k]. */ + doublecomplex *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + int_t *Urbs2, + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + doublecomplex *uval, *dest, *y; + int_t *lsub; + doublecomplex *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id,thread_id1,num_thread; + doublecomplex *rtemp_loc; + doublecomplex temp; + doublecomplex *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // if(nub>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + + if(nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. 
*/ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + { + zz_mult(&temp, &uval[uptr], &y[jj]); + z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow], + &temp); + ++uptr; + } + stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + }else{ +#ifdef _OPENMP + thread_id1 = omp_get_thread_num (); +#else + thread_id1 = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id1]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + { + zz_mult(&temp, &uval[uptr], &y[jj]); + z_sub(&dest[irow - ikfrow], &dest[irow - ikfrow], + &temp); + ++uptr; + } + stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + + +#ifdef _OPENMP + thread_id1 = omp_get_thread_num (); +#else + thread_id1 = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id1]; + for (ub = 0; ub < nub; ++ub){ + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; +#ifdef _CRAY + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; +#endif + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc + 1) * nrhs + + 10 * knsupc * nrhs; /* complex division */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i @@ -1186,19 +1190,21 @@ float zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Pslu_freeable_t *Pslu_freeable, - LUstruct_t *LUstruct, gridinfo_t *grid) + LUstruct_t *LUstruct, gridinfo_t *grid, int_t nrhs) { Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t Glu_freeable_n; LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, + int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, len, len1, nsupc, nsupc_gb, ii, nprocs; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ int_t ljb; /* local block column number */ int_t nrbl; /* number of L blocks in current block column */ int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; + int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p; int_t mybufmax[NBUFFERS]; NRformat_loc *Astore; doublecomplex *a; @@ -1206,7 +1212,7 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; doublecomplex *asup_val, *ainf_val; int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; int_t next_ind; /* next available position in index[*] */ int_t next_val; /* next available position in nzval[*] */ @@ -1216,10 +1222,26 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *recvBuf; int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + doublecomplex *lusup_srt; /* nonzero values in L and U */ doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. 
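+ Urbs1 (= Urbs + nub) is reused as a running counter while the
+ vertical lists Ucb_indptr[]/Ucb_valptr[] are filled in below.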
*/ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; @@ -1246,10 +1268,28 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *LUb_number; /* global block number; size nsupers_ij */ int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - doublecomplex *dense, *dense_col; /* SPA */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; +doublecomplex *dense, *dense_col; /* SPA */ doublecomplex zero = {0.0, 0.0}; int_t ldaspa; /* LDA of SPA */ int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; + int_t nub; + float memStrLU, memA, memDist = 0.; /* memory used for redistributing the data, which does not include the memory for the numerical values @@ -1261,6 +1301,10 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif /* Initialization. */ iam = grid->iam; @@ -1276,6 +1320,8 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, iword = sizeof(int_t); dword = sizeof(doublecomplex); + aln_i = ceil(CACHELINE/(double)iword); + if (fact == SamePattern_SameRowPerm) { ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); } @@ -1444,9 +1490,28 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); return (memDist + memNLU); } - memNLU += nsupers_j * sizeof(doublecomplex*) + nsupers_j * sizeof(int_t*); + + if ( !(Linv_bc_ptr = + (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + return (memDist + memNLU); + } + if ( !(Uinv_bc_ptr = + (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + return (memDist + memNLU); + } + if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ + fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); + return (memDist + memNLU); + } + + memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*); Lnzval_bc_ptr[nsupers_j-1] = NULL; Lrowind_bc_ptr[nsupers_j-1] = NULL; + Linv_bc_ptr[nsupers_j-1] = NULL; + Uinv_bc_ptr[nsupers_j-1] = NULL; + Lindval_loc_bc_ptr[nsupers_j-1] = NULL; /* These lists of processes will be used for triangular solves. 
*/ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { @@ -1734,7 +1799,17 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb); return (memDist + memNLU); } + + if (!(Linv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]"); + if (!(Uinv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]"); + memNLU += len1*iword + len*nsupc*dword; + + if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); + lusup = Lnzval_bc_ptr[ljb_j]; mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); @@ -1748,6 +1823,11 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, gb = LUb_number[k]; lb = LBi( gb, grid ); len = LUb_length[lb]; + + Lindval_loc_bc_ptr[ljb_j][k] = lb; + Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; + Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; + LUb_length[lb] = 0; index[next_ind++] = gb; /* Descriptor */ index[next_ind++] = len; @@ -1776,9 +1856,65 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, } } } /* for i ... */ + + + + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb_j][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb_j]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jjnpcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. 
*/ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + + + + ///////////////////////////////////////////////////////////////// + + // if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'z'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'z')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
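+ * (for each local block row, the process columns contributing to that row
+ * are determined via ActiveFlagAll and an MPI_Allreduce over the process
+ * row; a reduction tree rooted at the diagonal process column is then
+ * created with RdTree_Create)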
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'z'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + + #if ( PRNTlevel>=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
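+ * (built analogously to the broadcast tree for L, except that the
+ * participating process rows for block column jb are those owning U
+ * blocks in that column plus the row holding the diagonal block, which is
+ * stored with L)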
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); + + if(ibnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'z'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'z'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... 
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... */ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'z'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'z'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + /* Free the memory used for storing L and U */ + SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); + if (lsub != NULL) + SUPERLU_FREE(lsub); + if (usub != NULL) + SUPERLU_FREE(usub); + SUPERLU_FREE(nnzToRecv); SUPERLU_FREE(ptrToRecv); @@ -1930,7 +2748,10 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, SUPERLU_FREE(recvBuf); Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; Llu->Unzval_br_ptr = Unzval_br_ptr; Llu->ToRecv = ToRecv; @@ -1946,7 +2767,15 @@ zdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Llu->nbsendx = nbsendx; Llu->ilsum = ilsum; Llu->ldalsum = ldaspa; - LUstruct->Glu_persist = Glu_persist; + LUstruct->Glu_persist = Glu_persist; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", nLblocks, nUblocks); diff --git a/SRC/pzutil.c b/SRC/pzutil.c index 2fc49a5b..c919238f 100644 --- a/SRC/pzutil.c +++ b/SRC/pzutil.c @@ -537,3 +537,49 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx } } +/*! \brief Destroy distributed L & U matrices. */ +void +zDestroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) +{ + int_t i, nb, nsupers; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + +#if ( DEBUGlevel>=1 ) + int iam; + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + CHECK_MALLOC(iam, "Enter zDestroy_Tree()"); +#endif + + nsupers = Glu_persist->supno[n-1] + 1; + + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],'z'); + } + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],'z'); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); + + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],'z'); + } + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],'z'); + } + } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); + + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit zDestroy_Tree()"); +#endif +} \ No newline at end of file diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 0ba708d7..ec9fc892 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -21,8 +21,8 @@ at the top-level directory. *
*/ -#ifndef __SUPERLU_dDEFS /* allow multiple inclusions */ -#define __SUPERLU_dDEFS +#ifndef __SUPERLU_DDEFS /* allow multiple inclusions */ +#define __SUPERLU_DDEFS /* * File name: superlu_ddefs.h @@ -49,13 +49,13 @@ typedef struct { double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ - double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ - RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ #if 0 int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ double *Lval_buf; /* Buffer for the remote nonzeros of L */ @@ -100,6 +100,7 @@ typedef struct { int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */ int_t SolveMsgVol; /* Volume of messages sent in the solve phase */ + /*********************/ /* The following variables are used in the hybrid solver */ @@ -219,11 +220,10 @@ extern int dcreate_matrix_rb(SuperMatrix *, int, double **, int *, double **, int *, FILE *, gridinfo_t *); extern int dcreate_matrix_dat(SuperMatrix *, int, double **, int *, double **, int *, FILE *, gridinfo_t *); - extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *, double **, int *, FILE *, char *, gridinfo_t *); - + /* Driver related */ extern void dgsequ_dist (SuperMatrix *, double *, double *, double *, double *, double *, int_t *); @@ -259,9 +259,7 @@ extern void pdgssvx(superlu_dist_options_t *, SuperMatrix *, ScalePermstruct_t *, double *, int, int, gridinfo_t *, LUstruct_t *, SOLVEstruct_t *, double *, SuperLUStat_t *, int *); - extern void pdCompute_Diag_Inv(int_t, LUstruct_t *,gridinfo_t *, SuperLUStat_t *, int *); - extern int dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [], int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *); extern void dSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *); @@ -297,6 +295,7 @@ extern void dlsum_bmod(double *, double *, double *, int, int_t, int_t *, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, LocalLU_t *, MPI_Request [], SuperLUStat_t *); + extern void dlsum_fmod_inv(double *, double *, double *, double *, int, int, int_t , int_t *, int_t, int_t *, gridinfo_t *, LocalLU_t *, @@ -312,7 +311,8 @@ extern void dlsum_bmod_inv(double *, double *, double *, double *, extern void dlsum_bmod_inv_master(double *, double *, double *, double *, int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t **, int_t, int_t); + MPI_Request [], SuperLUStat_t **, int_t, int_t); + extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, double [], int_t, double [], int_t, int, @@ -356,7 +356,8 @@ extern void dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *, extern void dreadMM_dist(FILE *, int_t *, int_t *, int_t *, double **, int_t **, int_t **); extern int dread_binary(FILE *, int_t *, int_t *, int_t *, - double **, int_t **, int_t **); + double **, int_t **, int_t **); + /* Distribute 
the data for numerical factorization */ extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *, ScalePermstruct_t *, Pslu_freeable_t *, @@ -388,16 +389,11 @@ extern void dtrsm_(char*, char*, char*, char*, int*, int*, int*, int, int, int, int); extern void dgemv_(char *, int *, int *, double *, double *a, int *, double *, int *, double *, double *, int *, int); +extern void dtrtri_(char*, char*, int*, double*, int*,int*); + extern void dger_(int*, int*, double*, double*, int*, double*, int*, double*, int*); -extern int daxpy_(int *, double *, double *, int *, double *, int *); - - -extern void dtrtri_(char*, char*, int*, double*, int*,int*); - - - #else extern int dgemm_(const char*, const char*, const int*, const int*, const int*, const double*, const double*, const int*, const double*, @@ -410,8 +406,6 @@ extern int dgemv_(char *, int *, int *, double *, double *a, int *, double *, int *, double *, double *, int *); extern void dger_(int*, int*, double*, double*, int*, double*, int*, double*, int*); - -extern int daxpy_(int *, double *, double *, int *, double *, int *); #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 74a872f9..7cda561c 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,12 +1,4 @@ -/* superlu_dist_config.h.in */ - -/* Enable parmetis */ -#define HAVE_PARMETIS TRUE - -/* enable 64bit index mode */ -/* #undef XSDK_INDEX_SIZE */ - +/* #define XSDK_INDEX_SIZE 64 */ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 #endif - diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index 10999c48..8122f43e 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -20,8 +20,8 @@ at the top-level directory. * */ -#ifndef __SUPERLU_zDEFS /* allow multiple inclusions */ -#define __SUPERLU_zDEFS +#ifndef __SUPERLU_ZDEFS /* allow multiple inclusions */ +#define __SUPERLU_ZDEFS /* * File name: superlu_zdefs.h @@ -46,8 +46,16 @@ typedef struct { typedef struct { int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ + int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ + doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ #if 0 int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ double *Lval_buf; /* Buffer for the remote nonzeros of L */ @@ -118,6 +126,7 @@ typedef struct { int_t n; int_t nleaf; int_t nfrecvmod; + int_t inv; /* whether the diagonal block is inverted*/ } LocalLU_t; @@ -211,7 +220,10 @@ extern int zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); extern int zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, FILE *, gridinfo_t *); - +extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *, + doublecomplex **, int *, FILE *, char *, gridinfo_t *); + + /* Driver related */ extern void zgsequ_dist (SuperMatrix *, double *, double *, double *, 
double *, double *, int_t *); @@ -242,11 +254,12 @@ extern void pzgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *, SuperLUStat_t *, int *); extern float pzdistribute(fact_t, int_t, SuperMatrix *, ScalePermstruct_t *, Glu_freeable_t *, - LUstruct_t *, gridinfo_t *); + LUstruct_t *, gridinfo_t *, int_t); extern void pzgssvx(superlu_dist_options_t *, SuperMatrix *, ScalePermstruct_t *, doublecomplex *, int, int, gridinfo_t *, LUstruct_t *, SOLVEstruct_t *, double *, SuperLUStat_t *, int *); +extern void pzCompute_Diag_Inv(int_t, LUstruct_t *,gridinfo_t *, SuperLUStat_t *, int *); extern int zSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [], int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *); extern void zSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *); @@ -262,6 +275,7 @@ extern int static_schedule(superlu_dist_options_t *, int, int, extern void LUstructInit(const int_t, LUstruct_t *); extern void LUstructFree(LUstruct_t *); extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *); +extern void zDestroy_Tree(int_t, gridinfo_t *, LUstruct_t *); /* #define GPU_PROF #define IPM_PROF */ @@ -281,6 +295,24 @@ extern void zlsum_bmod(doublecomplex *, doublecomplex *, doublecomplex *, int, int_t, int_t *, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, LocalLU_t *, MPI_Request [], SuperLUStat_t *); + +extern void zlsum_fmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, + int, int, int_t , int_t *, int_t, + int_t *, gridinfo_t *, LocalLU_t *, + SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t); +extern void zlsum_fmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, + int, int, int_t , int_t *, int_t, + int_t *, gridinfo_t *, LocalLU_t *, + SuperLUStat_t **, int_t, int_t, int_t); +extern void zlsum_bmod_inv(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, + int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **, + int_t **, int_t *, gridinfo_t *, LocalLU_t *, + MPI_Request [], SuperLUStat_t **, int_t *, int_t *, int_t, int_t); +extern void zlsum_bmod_inv_master(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, + int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **, + int_t **, int_t *, gridinfo_t *, LocalLU_t *, + MPI_Request [], SuperLUStat_t **, int_t, int_t); + extern void pzgsrfs(int_t, SuperMatrix *, double, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, doublecomplex [], int_t, doublecomplex [], int_t, int, @@ -325,11 +357,13 @@ extern void zreadrb_dist(int, FILE *, int_t *, int_t *, int_t *, doublecomplex **, int_t **, int_t **); extern void zreadMM_dist(FILE *, int_t *, int_t *, int_t *, doublecomplex **, int_t **, int_t **); - +extern int zread_binary(FILE *, int_t *, int_t *, int_t *, + doublecomplex **, int_t **, int_t **); + /* Distribute the data for numerical factorization */ extern float zdist_psymbtonum(fact_t, int_t, SuperMatrix *, ScalePermstruct_t *, Pslu_freeable_t *, - LUstruct_t *, gridinfo_t *); + LUstruct_t *, gridinfo_t *, int_t nrhs); extern void pzGetDiagU(int_t, LUstruct_t *, gridinfo_t *, doublecomplex *); @@ -358,6 +392,8 @@ extern void ztrsm_(char*, char*, char*, char*, int*, int*, int*, int, int, int, int); extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *, doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int); +extern void ztrtri_(char*, char*, int*, doublecomplex*, int*,int*); + extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, doublecomplex*, int*, 
doublecomplex*, int*); diff --git a/TEST/pdtest.c b/TEST/pdtest.c index 0a53573b..ec261a1b 100644 --- a/TEST/pdtest.c +++ b/TEST/pdtest.c @@ -25,14 +25,9 @@ at the top-level directory. */ #include #include -//#include -#ifdef _MSC_VER -#include -#else +#include #include -#endif #include -#include "superlu_dist_config.h" #include "superlu_ddefs.h" #define NTESTS 1 /*5*/ /* Number of test types */ @@ -318,6 +313,7 @@ int main(int argc, char *argv[]) options.Fact = fact; if ( fact == SamePattern ) { // {L,U} not re-used in subsequent call to PDGSSVX. + dDestroy_Tree(n, &grid, &LUstruct); Destroy_LU(n, &grid, &LUstruct); } @@ -392,7 +388,8 @@ int main(int argc, char *argv[]) ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */ } ScalePermstructFree(&ScalePermstruct); - Destroy_LU(n, &grid, &LUstruct); + dDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { dSolveFinalize(&options, &SOLVEstruct); @@ -450,8 +447,6 @@ parse_command_line(int argc, char *argv[], int *nprow, int *npcol, int c; extern char *optarg; char str[20]; - char *xenvstr, *menvstr, *benvstr, *genvstr; - xenvstr = menvstr = benvstr = genvstr = 0; while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) { switch (c) { @@ -474,44 +469,24 @@ parse_command_line(int argc, char *argv[], int *nprow, int *npcol, break; case 'n': *n = atoi(optarg); break; -// Use putenv as exists on Windows -#ifdef _MSC_VER -#define putenv _putenv -#endif - case 'x': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("NREL", str, 1); - xenvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(xenvstr, "NREL="); - strcat(xenvstr, optarg); - putenv(xenvstr); + case 'x': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("NREL", str, 1); //printf("Reset relax env. variable to %d\n", c); break; - case 'm': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("NSUP", str, 1); - menvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(menvstr, "NSUP="); - strcat(menvstr, optarg); - putenv(menvstr); + case 'm': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("NSUP", str, 1); //printf("Reset maxsuper env. variable to %d\n", c); break; - case 'b': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("FILL", str, 1); - benvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(benvstr, "FILL="); - strcat(benvstr, optarg); - putenv(benvstr); + case 'b': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("FILL", str, 1); //printf("Reset fill_ratio env. variable to %d\n", c); break; - case 'g': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("N_GEMM", str, 1); - genvstr = (char*) malloc((8+strlen(optarg))*sizeof(char)); - strcpy(genvstr, "N_GEMM="); - strcat(genvstr, optarg); - putenv(genvstr); + case 'g': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("N_GEMM", str, 1); //printf("Reset min_gemm_gpu_offload env. variable to %d\n", c); break; case 's': *nrhs = atoi(optarg); diff --git a/TEST/pztest.c b/TEST/pztest.c index 3ba5439e..d168e832 100644 --- a/TEST/pztest.c +++ b/TEST/pztest.c @@ -24,14 +24,9 @@ at the top-level directory. 
*/ #include #include -//#include -#ifdef _MSC_VER -#include -#else +#include #include -#endif #include -#include "superlu_dist_config.h" #include "superlu_zdefs.h" #define NTESTS 1 /*5*/ /* Number of test types */ @@ -317,6 +312,7 @@ int main(int argc, char *argv[]) options.Fact = fact; if ( fact == SamePattern ) { // {L,U} not re-used in subsequent call to PDGSSVX. + zDestroy_Tree(n, &grid, &LUstruct); Destroy_LU(n, &grid, &LUstruct); } @@ -391,7 +387,8 @@ int main(int argc, char *argv[]) ScalePermstruct.DiagScale = NOEQUIL; /* Avoid free R/C again. */ } ScalePermstructFree(&ScalePermstruct); - Destroy_LU(n, &grid, &LUstruct); + zDestroy_Tree(n, &grid, &LUstruct); + Destroy_LU(n, &grid, &LUstruct); LUstructFree(&LUstruct); if ( options.SolveInitialized ) { zSolveFinalize(&options, &SOLVEstruct); @@ -449,8 +446,6 @@ parse_command_line(int argc, char *argv[], int *nprow, int *npcol, int c; extern char *optarg; char str[20]; - char *xenvstr, *menvstr, *benvstr, *genvstr; - xenvstr = menvstr = benvstr = genvstr = 0; while ( (c = getopt(argc, argv, "hr:c:t:n:x:m:b:g:s:f:")) != EOF ) { switch (c) { @@ -473,44 +468,24 @@ parse_command_line(int argc, char *argv[], int *nprow, int *npcol, break; case 'n': *n = atoi(optarg); break; -// Use putenv as exists on Windows -#ifdef _MSC_VER -#define putenv _putenv -#endif - case 'x': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("NREL", str, 1); - xenvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(xenvstr, "NREL="); - strcat(xenvstr, optarg); - putenv(xenvstr); + case 'x': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("NREL", str, 1); //printf("Reset relax env. variable to %d\n", c); break; - case 'm': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("NSUP", str, 1); - menvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(menvstr, "NSUP="); - strcat(menvstr, optarg); - putenv(menvstr); + case 'm': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("NSUP", str, 1); //printf("Reset maxsuper env. variable to %d\n", c); break; - case 'b': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("FILL", str, 1); - benvstr = (char*) malloc((6+strlen(optarg))*sizeof(char)); - strcpy(benvstr, "FILL="); - strcat(benvstr, optarg); - putenv(benvstr); + case 'b': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("FILL", str, 1); //printf("Reset fill_ratio env. variable to %d\n", c); break; - case 'g': // c = atoi(optarg); - // sprintf(str, "%d", c); - // setenv("N_GEMM", str, 1); - genvstr = (char*) malloc((8+strlen(optarg))*sizeof(char)); - strcpy(genvstr, "N_GEMM="); - strcat(genvstr, optarg); - putenv(genvstr); + case 'g': c = atoi(optarg); + sprintf(str, "%d", c); + setenv("N_GEMM", str, 1); //printf("Reset min_gemm_gpu_offload env. 
variable to %d\n", c); break; case 's': *nrhs = atoi(optarg); diff --git a/build/batch_script_mpi_runit_pureOMP.sh b/build/batch_script_mpi_runit_pureOMP.sh deleted file mode 100644 index 8b97ea17..00000000 --- a/build/batch_script_mpi_runit_pureOMP.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# Bash script to submit many files to Cori/Edison/Queue - -EXIT_SUCCESS=0 -EXIT_HOST=1 -EXIT_PARAM=2 - -# MAX_PARAMS=1 -# # ^^^ This should be fixed, as it should just loop through everything -# if [[ $# -eq 0 ]]; then - # echo "Must have at least one parameter; exiting" - # exit $EXIT_PARAM -# fi -# if [[ $# -gt $MAX_PARAMS ]]; then - # echo "Too many parameters; exiting" - # exit $EXIT_PARAM -# fi - -# INPUT_FILE=$1 -# # ^^^ Get the input ile - -CUR_DIR=`pwd` -FILE_DIR=$CUR_DIR/EXAMPLE -INPUT_DIR=/project/projectdirs/sparse/liuyangz/my_research/matrix -FILE_NAME=pddrive -FILE=$FILE_DIR/$FILE_NAME - -TMP_BATCH_FILE=tmp_batch_file.slurm -# ^^^ Should check that this is not taken, -# but I would never use this, so ... - -> $TMP_BATCH_FILE - -if [[ $NERSC_HOST == edison ]]; then - CORES_PER_NODE=24 - THREADS_PER_NODE=48 -elif [[ $NERSC_HOST == cori ]]; then - CORES_PER_NODE=32 - THREADS_PER_NODE=64 - # This does not take hyperthreading into account -else - # Host unknown; exiting - exit $EXIT_HOST -fi - - -#nprows=(6 12 24) -#npcols=(6 12 24) - - -#nprows=(2048 1 32) -#npcols=(1 2048 64) - -# nprows=(32 ) -# npcols=(64 ) - - - -#nprows=(24 48 1 1 576 2304) -#npcols=(24 48 576 2304 1 1) - - -#nprows=(48 1 2304) -#npcols=(48 2304 1) - -#nprows=(6 12 24 48 ) -#npcols=(6 12 24 48 ) - -#nprows=(6 12 24 48 1 1 1 1 36 144 576 2304) -#npcols=(6 12 24 48 36 144 576 2304 1 1 1 1) - -#nprows=(32 128 512 1 1 1 4 8 16) -#npcols=(1 1 1 32 128 512 8 16 32) - -#nprows=(2048 1 32) -#npcols=(1 2048 64) - - - - -#nprows=(12 1 144) -#npcols=(12 144 1) - - -nprows=(1) -npcols=(1) - -for ((i = 0; i < ${#npcols[@]}; i++)); do -NROW=${nprows[i]} -NCOL=${npcols[i]} - -# NROW=36 -CORE_VAL=`expr $NCOL \* $NROW` -NODE_VAL=`expr $CORE_VAL / $CORES_PER_NODE` -MOD_VAL=`expr $CORE_VAL % $CORES_PER_NODE` -if [[ $MOD_VAL -ne 0 ]] -then - NODE_VAL=`expr $NODE_VAL + 1` -fi -#PARTITION=debug -PARTITION=regular -LICENSE=SCRATCH -TIME=00:20:00 - -if [[ $NERSC_HOST == edison ]] -then - CONSTRAINT=0 -fi -if [[ $NERSC_HOST == cori ]] -then - CONSTRAINT=haswell -fi - - -for NTH in 1 16 32 -do - -OMP_NUM_THREADS=$NTH -TH_PER_RANK=`expr $NTH \* 2` - - -#for NSUP in 128 64 32 16 8 -#do - # for MAT in atmosmodl.rb nlpkkt80.mtx torso3.mtx Ga19As19H42.mtx A22.mtx cage13.rb - # for MAT in torso3.mtx - # for MAT in matrix121.dat matrix211.dat tdr190k.dat tdr455k.dat nlpkkt80.mtx torso3.mtx helm2d03.mtx - # for MAT in tdr190k.dat Ga19As19H42.mtx - for MAT in hvdc2.mtx torso3.mtx matrix121.dat helm2d03.mtx - # for MAT in torso3.mtx -# for MAT in A22.bin DG_GrapheneDisorder_8192.bin DNA_715_64cell.bin LU_C_BN_C_4by2.bin Li4244.bin atmosmodj.bin Ga19As19H42.bin Geo_1438.bin StocF-1465.bin - -# for MAT in LU_C_BN_C_4by2.bin Li4244.bin atmosmodj.bin Ga19As19H42.bin Geo_1438.bin StocF-1465.bin - - -#for MAT in big.rua -#for MAT in torso3.bin -# for MAT in Ga19As19H42.mtx - do - # Start of looping stuff - > $TMP_BATCH_FILE - echo "#!/bin/bash -l" >> $TMP_BATCH_FILE - echo " " >> $TMP_BATCH_FILE - echo "#SBATCH -p $PARTITION" >> $TMP_BATCH_FILE - echo "#SBATCH -N $NODE_VAL" >> $TMP_BATCH_FILE - echo "#SBATCH -t $TIME" >> $TMP_BATCH_FILE - echo "#SBATCH -L $LICENSE" >> $TMP_BATCH_FILE - echo "#SBATCH -J SLU_$MAT" >> $TMP_BATCH_FILE - #echo "#SBATCH -o 
./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_flat_mrhs" >> $TMP_BATCH_FILE - #echo "#SBATCH -e ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_flat_mrhs" >> $TMP_BATCH_FILE - # echo "#SBATCH --mail-type=BEGIN" >> $TMP_BATCH_FILE - # echo "#SBATCH --mail-type=END" >> $TMP_BATCH_FILE - echo "#SBATCH --mail-user=liuyangzhuan@lbl.gov" >> $TMP_BATCH_FILE - if [[ $NERSC_HOST == cori ]] - then - echo "#SBATCH -C $CONSTRAINT" >> $TMP_BATCH_FILE - fi - mkdir -p $MAT - echo "export OMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE - echo "export KMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE - echo "export MKL_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE - echo "export NSUP=128" >> $TMP_BATCH_FILE - echo "export NREL=20" >> $TMP_BATCH_FILE - echo "export OMP_PLACES=threads" >> $TMP_BATCH_FILE - echo "export OMP_PROC_BIND=spread" >> $TMP_BATCH_FILE - echo "export MPICH_MAX_THREAD_SAFETY=multiple" >> $TMP_BATCH_FILE - - echo " " >> $TMP_BATCH_FILE - echo "FILE=$FILE" >> $TMP_BATCH_FILE - echo "FILEMAT=$INPUT_DIR/$MAT" >> $TMP_BATCH_FILE - echo " " >> $TMP_BATCH_FILE - echo "CORE_VAL=$CORE_VAL" >> $TMP_BATCH_FILE - echo "NCOL=$NCOL" >> $TMP_BATCH_FILE - echo "NROW=$NROW" >> $TMP_BATCH_FILE - # This should be computed individually for each script... - - export OMP_NUM_THREADS=$OMP_NUM_THREADS - export OMP_PLACES=threads - export OMP_PROC_BIND=spread - export MPICH_MAX_THREAD_SAFETY=multiple - srun -n $CORE_VAL -c $TH_PER_RANK --cpu_bind=cores $FILE -c $NCOL -r $NROW $INPUT_DIR/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_OMP_${OMP_NUM_THREADS} - # Add final line (srun line) to temporary slurm script - - #cat $TMP_BATCH_FILE - #echo " " - # sbatch $TMP_BATCH_FILE - done -#one - -done -done - -exit $EXIT_SUCCESS - diff --git a/make.inc_good_static b/make.inc_good_static index e11d8899..6e200d53 100644 --- a/make.inc_good_static +++ b/make.inc_good_static @@ -15,7 +15,7 @@ # # The name of the libraries to be created/linked to # -SuperLUroot = /global/homes/l/liuyangz/Cori/my_research/github/superlu_dist_task_hybrid_whypddistslow_01_27_2018/build +SuperLUroot = ./build DSUPERLULIB = $(SuperLUroot)/SRC/libsuperlu_dist.a LIBS = $(DSUPERLULIB) /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a diff --git a/make.inc_tmp b/make.inc_tmp new file mode 100644 index 00000000..b85cf6a5 --- /dev/null +++ b/make.inc_tmp @@ -0,0 +1,40 @@ +############################################################################ +# +# Program: SuperLU_DIST +# +# Module: make.inc +# +# Purpose: Top-level Definitions +# +# Creation date: March 1, 2016 version 5.0.0 +# +# Modified: +# +# +############################################################################ +# +# The name of the libraries to be created/linked to +# +SuperLUroot = /global/homes/l/liuyangz/Cori/my_research/github/superlu_dist_trisolve_done_02_11_2018/build +DSUPERLULIB = $(SuperLUroot)/SRC/libsuperlu_dist.so + +LIBS = $(DSUPERLULIB) /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so 
/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so /global/homes/l/liuyangz/Cori/my_software/parmetis-4.0.3_dynamic/build/Linux-x86_64/libparmetis/libparmetis.so /global/homes/l/liuyangz/Cori/my_software/parmetis-4.0.3_dynamic/build/Linux-x86_64/libmetis/libmetis.so + +# +# The archiver and the flag(s) to use when building archive (library) +# If your system has no ranlib, set RANLIB = echo. +# +ARCH = /usr/bin/ar +ARCHFLAGS = cr +RANLIB = /usr/bin/ranlib + +CC = /opt/cray/pe/craype/2.5.12/bin/cc +CFLAGS = -O3 -DNDEBUG -I/global/homes/l/liuyangz/Cori/my_software/parmetis-4.0.3_dynamic/metis/include -I/global/homes/l/liuyangz/Cori/my_software/parmetis-4.0.3_dynamic/include -DUSE_VENDOR_BLAS -qopenmp -g -O0 -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 +# CFLAGS += -D_LONGINT ## 64-bit integer +# CFLAGS += -D +# CFLAGS += +NOOPTS = -O0 +FORTRAN = /opt/cray/pe/craype/2.5.12/bin/ftn + +LOADER = $(CC) +LOADOPTS = -Wl,-rpath,-qopenmp -qopenmp
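
A minimal usage sketch (editorially hedged, not prescribed by the patch itself): the header changes above add pzCompute_Diag_Inv, zDestroy_Tree, and the zlsum_*_inv kernels, and the TEST drivers now call zDestroy_Tree before Destroy_LU. Assuming the usual pzdrive-style setup (grid, options, A, right-hand side b, ScalePermstruct, LUstruct, SOLVEstruct, stat already initialized), the new cleanup order would look roughly like the following; the wrapper name solve_and_cleanup is illustrative only:

    #include "superlu_zdefs.h"

    /* Sketch: factor/solve with pzgssvx, then release the new
       broadcast/reduction solve trees before the usual LU cleanup,
       mirroring the TEST/pztest.c changes in this patch. */
    void solve_and_cleanup(superlu_dist_options_t *options, SuperMatrix *A,
                           ScalePermstruct_t *ScalePermstruct, doublecomplex *b,
                           int ldb, int nrhs, gridinfo_t *grid,
                           LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct,
                           double *berr, SuperLUStat_t *stat, int *info)
    {
        pzgssvx(options, A, ScalePermstruct, b, ldb, nrhs, grid,
                LUstruct, SOLVEstruct, berr, stat, info);

        zDestroy_Tree(A->ncol, grid, LUstruct);   /* new in this patch */
        Destroy_LU(A->ncol, grid, LUstruct);
        LUstructFree(LUstruct);
    }

Whether pzCompute_Diag_Inv is invoked inside pzgssvx or must be called explicitly by the driver is not shown in these hunks, so it is left out of the sketch.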