diff --git a/CMakeLists.txt b/CMakeLists.txt index e70b37b4..42b2deff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,10 +180,10 @@ if (enable_parmetislib) ## want to use parmetis # fix up PARMETIS library names string (REPLACE ";" " " PARMETIS_LIB_STR "${PARMETIS_LIB}") set(PARMETIS_LIB_EXPORT ${PARMETIS_LIB_STR}) - set(VTUNE_LIB_EXPORT "dfdfdfddf") + else() message("-- Will not link with ParMETIS.") -endif() +endif() # if(NOT enable_parmetislib) # find_package(PARMETIS) ## does not have this Module yet. diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 907c4ec6..f68baa41 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -133,7 +133,8 @@ pzdrive4_ABglobal: $(ZEXMG4) $(DSUPERLULIB) # $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c pdgstrf.c $(VERBOSE) .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) - +.cpp.o: + $(CPP) $(CPPFLAGS) $(CDEFS) $(BLASDEF) $(INCLUDEDIR) -c $< $(VERBOSE) .f.o: $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) diff --git a/EXAMPLE/dcreate_matrix_perturbed.c b/EXAMPLE/dcreate_matrix_perturbed.c index d4ea5e13..4c6ae016 100644 --- a/EXAMPLE/dcreate_matrix_perturbed.c +++ b/EXAMPLE/dcreate_matrix_perturbed.c @@ -228,3 +228,192 @@ int dcreate_matrix_perturbed(SuperMatrix *A, int nrhs, double **rhs, #endif return 0; } + + + +int dcreate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, char *postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. */ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doubleMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + 
printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} \ No newline at end of file diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index cf115f36..4095ac78 100644 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -65,15 +65,26 @@ int main(int argc, char *argv[]) FILE *fp, *fopen(); int cpp_defs(); int ii; + int omp_mpi_level; nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ - + nrhs =1; /* Number of right-hand side. */ + /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); + //MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + +#if ( VAMPIR>=1 ) + VT_traceoff(); +#endif + +#if ( VTUNE>=1 ) + __itt_pause(); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { @@ -105,6 +116,28 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + if(grid.iam==0){ + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; @@ -164,14 +197,13 @@ int main(int argc, char *argv[]) options.ReplaceTinyPivot = NO; #endif - - //options.ParSymbFact = YES; - //options.ColPerm = PARMETIS; +// options.ParSymbFact = YES; +// options.ColPerm = PARMETIS; +// options.RowPerm = NOROWPERM; options.IterRefine = 0; - options.DiagInv = YES; - options.RowPerm = NOROWPERM; - //options.ReplaceTinyPivot = NO; - options.SymPattern = YES; +// options.DiagInv = YES; + options.ReplaceTinyPivot = NO; + options.SymPattern = YES; if (!iam) { print_sp_ienv_dist(&options); diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index d61db546..f855a485 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -55,10 +55,10 @@ int main(int argc, char *argv[]) gridinfo_t grid; double *berr; double *b, *xtrue, *b1; - int i, j, m, n; + int i, j, m, n, ii; int nprow, npcol; int iam, info, ldb, ldx, nrhs; - char **cpp, c; + char **cpp, c, *postfix; FILE *fp, *fopen(); int cpp_defs(); @@ -118,11 +118,19 @@ int main(int argc, char *argv[]) CHECK_MALLOC(iam, "Enter main()"); #endif + for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ iam = grid1.iam; /* Get the logical number in the new grid. */ /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ - dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid1); - + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); + + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); @@ -210,7 +220,7 @@ int main(int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
------------------------------------------------------------*/ - dcreate_matrix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid2); + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index bb84a794..ed96c65e 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -11,8 +11,6 @@ set(headers colamd.h timer.h environment.hpp - blas.hpp - lapack.hpp TreeBcast_v2.hpp TreeReduce_v2.hpp TreeBcast_v2_impl.hpp @@ -22,8 +20,6 @@ set(headers # first: precision-independent files set(sources global.cpp - blas.cpp - lapack.cpp TreeInterface.cpp sp_ienv.c etree.c @@ -66,8 +62,8 @@ if(enable_double) dreadhb.c dreadrb.c dreadtriple.c - dreadMM.c dbinary_io.c + dreadMM.c pdgsequ.c pdlaqgs.c dldperm_dist.c diff --git a/SRC/Makefile b/SRC/Makefile index 63ac6751..1aa5723e 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -29,11 +29,11 @@ include ../make.inc # # Precision independent routines # -ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ +ALLAUX = global.o blas.o lapack.o TreeInterface.o sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ mmd.o comm.o memory.o util.o superlu_grid.o \ pxerr_dist.o superlu_timer.o symbfact.o \ psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \ - static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o + static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o colamd.o ifeq "${ACC}" "GPU" ALLAUX += cublas_utils.o @@ -51,7 +51,7 @@ ZSLUSRC = dcomplex_dist.o zlangs_dist.o zgsequ_dist.o zlaqgs_dist.o \ # # Routines for double precision parallel SuperLU DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ - dreadhb.o dreadrb.o dreadtriple.o dreadMM.o \ + dreadhb.o dreadrb.o dreadtriple.o dreadMM.o dbinary_io.o \ pdgsequ.o pdlaqgs.o dldperm_dist.o pdlangs.o pdutil.o \ pdsymbfact_distdata.o ddistribute.o pddistribute.o \ pdgstrf.o pdgstrf2.o pdGetDiagU.o \ @@ -90,7 +90,7 @@ pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) .cpp.o: - $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) + $(CPP) $(CPPFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) .f.o: $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) diff --git a/SRC/TreeBcast_v2.hpp b/SRC/TreeBcast_v2.hpp index 6826e61f..7b9d682f 100644 --- a/SRC/TreeBcast_v2.hpp +++ b/SRC/TreeBcast_v2.hpp @@ -2,7 +2,7 @@ #define _PEXSI_TREE_V2_HPP_ #include "environment.hpp" - #include "blas.hpp" +// #include "blas.hpp" #include "timer.h" #include "superlu_defs.h" @@ -22,7 +22,7 @@ -namespace PEXSI{ +namespace ASYNCOMM{ extern std::map< MPI_Comm , std::vector > commGlobRanks; @@ -64,13 +64,6 @@ namespace PEXSI{ MPI_Datatype type_; -#ifdef COMM_PROFILE_BCAST - protected: - Int myGRoot_; - Int myGRank_; - public: - inline void SetGlobalComm(const MPI_Comm & pGComm); -#endif protected: virtual void buildTree(Int * ranks, Int rank_cnt)=0; @@ -93,27 +86,18 @@ namespace PEXSI{ virtual inline Int GetNumSendMsg(); inline void SetDataReady(bool rdy); inline void SetTag(Int tag); - inline int GetTag(); + inline Int GetTag(); Int * GetDests(); Int GetDest(Int i); Int GetDestCount(); Int GetRoot(); bool IsRoot(); - bool StartForward(); void SetMsgSize(Int msgSize){ this->msgSize_ = msgSize;} Int GetMsgSize(); - bool IsDone(); bool IsReady(){ return this->isReady_;} - - virtual void SetLocalBuffer(T * locBuffer); - virtual T * GetLocalBuffer(); - //async wait and forward - virtual bool Progress(); virtual void AllocateBuffer(); - 
//blocking wait - void Wait(); virtual void cleanupBuffers(); @@ -122,13 +106,6 @@ namespace PEXSI{ virtual void forwardMessageSimple(T * locBuffer); virtual void waitSendRequest(); - - protected: - virtual void postRecv(); - virtual void forwardMessage(); - virtual void copyLocalBuffer(T* destBuffer); - virtual bool isMessageForwarded(); - virtual bool IsDataReceived(); }; @@ -164,23 +141,9 @@ namespace PEXSI{ }; - template< typename T> - void TreeBcast_Waitsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags); - - template< typename T> - void TreeBcast_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags); - - template< typename T> - void TreeBcast_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedEpochs); - - template< typename T> - void TreeBcast_Waitall(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees); - - - -}//namespace PEXSI +}//namespace ASYNCOMM #include "TreeBcast_v2_impl.hpp" #endif diff --git a/SRC/TreeBcast_v2_impl.hpp b/SRC/TreeBcast_v2_impl.hpp index a8840d9c..6ca7f639 100644 --- a/SRC/TreeBcast_v2_impl.hpp +++ b/SRC/TreeBcast_v2_impl.hpp @@ -6,28 +6,7 @@ // #include "TreeBcast_v2.hpp" -namespace PEXSI{ -#ifdef COMM_PROFILE_BCAST - template< typename T> - inline void TreeBcast_v2::SetGlobalComm(const MPI_Comm & pGComm){ - if(commGlobRanks.count(comm_)==0){ - MPI_Group group2 = MPI_GROUP_NULL; - MPI_Comm_group(pGComm, &group2); - MPI_Group group1 = MPI_GROUP_NULL; - MPI_Comm_group(comm_, &group1); - - Int size; - MPI_Comm_size(comm_,&size); - vector globRanks(size); - vector Lranks(size); - for(int i = 0; i @@ -157,51 +136,10 @@ namespace PEXSI{ tag_ = tag; } template< typename T> - inline int TreeBcast_v2::GetTag(){ + inline Int TreeBcast_v2::GetTag(){ return tag_; } - template< typename T> - inline bool TreeBcast_v2::IsDone(){ - return done_; - } - - - template< typename T> - inline bool TreeBcast_v2::IsDataReceived(){ - bool retVal = false; - if(myRank_==myRoot_){ - retVal = isReady_; - } - else if(recvCount_ == 1){ - retVal = true; - } - else if(recvRequests_[0] == MPI_REQUEST_NULL ){ - //post the recv - postRecv(); - retVal = false; - } - else if(recvRequests_[0] != MPI_REQUEST_NULL ){ -#if ( _DEBUGlevel_ >= 1 ) || defined(BCAST_VERBOSE) - statusOFS<recvCount_++; - } - - retVal = flag==1; - if(recvCount_==recvPostedCount_){ - //mark that we are ready to send / forward - isReady_ = true; - } - } - return retVal; - } template< typename T> inline Int * TreeBcast_v2::GetDests(){ @@ -225,58 +163,12 @@ namespace PEXSI{ return this->myRoot_==this->myRank_; } - template< typename T> - inline bool TreeBcast_v2::StartForward(){ - return this->fwded_==true; - } template< typename T> inline Int TreeBcast_v2::GetMsgSize(){ return this->msgSize_; } - template< typename T> - inline void TreeBcast_v2::forwardMessage( ){ - if(this->isReady_){ -#if ( _DEBUGlevel_ >= 1 ) || defined(BCAST_VERBOSE) - // std::cout<myRank_<<" FORWARDING on tag "<tag_<myRank_<<" FORWARDING on tag "<tag_<sendRequests_.size()!=this->GetDestCount()){ - this->sendRequests_.assign(this->GetDestCount(),MPI_REQUEST_NULL); - } - - for( Int idxRecv = 0; idxRecv < this->myDests_.size(); ++idxRecv ){ - Int iProc = this->myDests_[idxRecv]; - // Use Isend to send to multiple targets - int error_code = MPI_Isend( this->recvDataPtrs_[0], this->msgSize_, this->type_, - iProc, this->tag_,this->comm_, 
&this->sendRequests_[idxRecv] ); -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<= 1 ) || defined(BCAST_VERBOSE) - statusOFS<myRank_<<" FWD to "<tag_<myGRank_,commGlobRanks[this->comm_][iProc],this->tag_,this->msgSize_); -#endif - this->sendPostedCount_++; - } // for (iProc) - this->fwded_ = true; - } - } - template< typename T> @@ -339,92 +231,12 @@ namespace PEXSI{ this->sendDoneIdx_.shrink_to_fit(); this->sendDataPtrs_.shrink_to_fit(); this->sendTempBuffer_.shrink_to_fit(); + + this->myDests_.clear(); + } - template< typename T> - inline void TreeBcast_v2::SetLocalBuffer(T * locBuffer){ - //if recvDataPtrs_[0] has been allocated as a temporary buffer - if(this->recvDataPtrs_[0]!=NULL && this->recvDataPtrs_[0]!=locBuffer){ - //If we have received some data, we need to copy - //it to the new buffer - if(this->recvCount_>0){ - double t1,t2; - TIC(t1); - copyLocalBuffer(locBuffer); - TOC(t2,t1); - - } - - //If data hasn't been forwarded yet, - //it is safe to clear recvTempBuffer_ now - if(!this->fwded_){ - this->recvTempBuffer_.clear(); - } - } - - this->recvDataPtrs_[0] = locBuffer; - } - - - template< typename T> - inline bool TreeBcast_v2::isMessageForwarded(){ - bool retVal=false; - - if(!this->fwded_){ - //If data has been received but not forwarded - if(IsDataReceived()){ - forwardMessage(); - } - retVal = false; - } - else{ - //If data has been forwared, check for completion of send requests - int destCount = this->myDests_.size(); - int completed = 0; - if(destCount>0){ - //test the send requests - int flag = 0; - - this->sendDoneIdx_.resize(this->GetDestCount()); -#ifndef CHECK_MPI_ERROR - MPI_Testsome(destCount,this->sendRequests_.data(),&completed,this->sendDoneIdx_.data(),MPI_STATUSES_IGNORE); -#else - this->sendStatuses_.resize(destCount); - int error_code = MPI_Testsome(destCount,this->sendRequests_.data(),&completed,this->sendDoneIdx_.data(),this->sendStatuses_.data()); - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<sendStatuses_.size();i++){ - error_code = this->sendStatuses_[i].MPI_ERROR; - if(error_code != MPI_SUCCESS){ - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<sendCount_ += completed; - retVal = this->sendCount_ == this->sendPostedCount_; - - } - return retVal; - } - template< typename T> inline void TreeBcast_v2::AllocateBuffer() { @@ -438,96 +250,6 @@ namespace PEXSI{ } } - - - //async wait and forward - template< typename T> - inline bool TreeBcast_v2::Progress(){ - bool retVal = this->done_; - - if(!retVal){ - retVal = isMessageForwarded(); - - if(retVal){ - //if the local buffer has been set by the user, but the temporary - //buffer was already in use, we can clear it now - if(this->recvTempBuffer_.size()>0){ - if(this->recvDataPtrs_[0]!=(T*)this->recvTempBuffer_.data()){ - this->recvTempBuffer_.clear(); - } - } - - //free the unnecessary arrays - this->sendRequests_.clear(); -#if ( _DEBUGlevel_ >= 1 ) || defined(BCAST_VERBOSE) - statusOFS<myRank_<<" EVERYTHING COMPLETED on tag "<tag_<done_ = retVal; - return retVal; - - } - - //blocking wait - template< typename T> - inline void 
TreeBcast_v2::Wait(){ - if(!this->done_){ - while(!Progress()); - } - } - - template< typename T> - inline T* TreeBcast_v2::GetLocalBuffer(){ - assert(this->recvDataPtrs_.size()>0); - assert(this->recvDataPtrs_[0]!=nullptr); - return this->recvDataPtrs_[0]; - } - - template< typename T> - inline void TreeBcast_v2::postRecv() - { -#if ( _DEBUGlevel_ >= 1 ) || defined(BCAST_VERBOSE) - statusOFS<myRank_<<" POSTING RECV on tag "<tag_<recvCount_<1 && this->recvRequests_[0]==MPI_REQUEST_NULL && !this->IsRoot() ){ - - if(this->recvDataPtrs_[0]==NULL){ - this->recvTempBuffer_.resize(this->msgSize_); - this->recvDataPtrs_[0] = (T*)this->recvTempBuffer_.data(); - } - int error_code = MPI_Irecv( (char*)this->recvDataPtrs_[0], this->msgSize_, this->type_, - this->myRoot_, this->tag_,this->comm_, &this->recvRequests_[0] ); -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<recvPostedCount_=1; - } - } - - - - template< typename T> - inline void TreeBcast_v2::copyLocalBuffer(T* destBuffer){ - std::copy((T*)this->recvDataPtrs_[0],(T*)this->recvDataPtrs_[0]+this->msgSize_,destBuffer); - } - - template< typename T> inline TreeBcast_v2 * TreeBcast_v2::Create(const MPI_Comm & pComm, Int * ranks, Int rank_cnt, Int msgSize, double rseed){ //get communicator size @@ -798,117 +520,7 @@ namespace PEXSI{ } - - template< typename T> - void TreeBcast_Waitsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags){ - doneIdx.clear(); - auto all_done = [](const std::vector & boolvec){ - return std::all_of(boolvec.begin(), boolvec.end(), [](bool v) { return v; }); - }; - - while(doneIdx.empty() && !all_done(finishedFlags) ){ - - //for(auto it = finishedFlags.begin();it!=finishedFlags.end();it++){ - // statusOFS<<(*it?"1":"0")<<" "; - //} - //statusOFS<Progress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ - finishedFlags[i] = true; - } - } - } - } - - template< typename T> - void TreeBcast_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags){ - doneIdx.clear(); - for(int i = 0; iProgress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ - finishedFlags[i] = true; - } - } - } - - template< typename T> - void TreeBcast_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedEpochs){ - doneIdx.clear(); - assert(finishedEpochs.size()==treeIdx.size()+1); - Int curEpoch = ++finishedEpochs.back(); - for(int i = 0; iProgress(); - if(done){ - if(finishedEpochs[i]<=0){ - doneIdx.push_back(i); - finishedEpochs[i] = curEpoch; - } - } - } - else{ - finishedEpochs[i] = curEpoch; - } - } - } - - - - template< typename T> - void TreeBcast_Waitall(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees){ - std::list doneIdx; - std::vector finishedFlags(treeIdx.size(),false); - - doneIdx.clear(); - auto all_done = [](const std::vector & boolvec){ - return std::all_of(boolvec.begin(), boolvec.end(), [](bool v) { return v; }); - }; - - while(!all_done(finishedFlags) ){ - for(int i = 0; iProgress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ 
- finishedFlags[i] = true; - } - } - } - } - -} //namespace PEXSI +} //namespace ASYNCOMM #endif diff --git a/SRC/TreeInterface.cpp b/SRC/TreeInterface.cpp index 1aa1204e..ecff0344 100644 --- a/SRC/TreeInterface.cpp +++ b/SRC/TreeInterface.cpp @@ -1,7 +1,7 @@ #include "TreeReduce_v2.hpp" -namespace PEXSI{ +namespace ASYNCOMM{ @@ -9,91 +9,28 @@ namespace PEXSI{ extern "C" { #endif - void TreeTest(void *tree) { - std::cout<<" ahhh good! "<0); TreeBcast_v2* BcastTree = TreeBcast_v2::Create(comm,ranks,rank_cnt,msgSize,rseed); return (BcTree) BcastTree; } - void BcTree_SetTag(BcTree Tree, Int tag){ + void BcTree_Destroy(BcTree Tree){ TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - BcastTree->SetTag(tag); - } - - yes_no_t BcTree_Progress(BcTree Tree){ - TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - bool done = BcastTree->Progress(); - // std::cout<* BcastLTree = (TreeBcast_v2*) Tree; - BcastLTree->AllocateBuffer(); - } - + delete BcastTree; + } - // Int BcTree_Iprobe(BcTree Tree, MPI_Status* status){ - // TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - // Int flag; - - // MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, BcastTree->comm_, &flag, status); - // if(flag!=0){ - // printf("hahah %5d", flag); - // fflush(stdout); - // } - // return flag; - // } - - - void BcTree_SetDataReady(BcTree Tree){ + void BcTree_SetTag(BcTree Tree, Int tag){ TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - BcastTree->SetDataReady(true); + BcastTree->SetTag(tag); } - void BcTree_SetLocalBuffer(BcTree Tree, void* localBuffer){ - TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - BcastTree->SetLocalBuffer( (double*) localBuffer); - } yes_no_t BcTree_IsRoot(BcTree Tree){ TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; return BcastTree->IsRoot()?YES:NO; } - yes_no_t BcTree_StartForward(BcTree Tree){ - TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; - return BcastTree->StartForward()?YES:NO; - } - - - - void BcTree_Testsome(StdList TreeIdx, BcTree* ArrTrees, int* Outcount, int* FinishedTrees){ - std::list* treeIdx = (std::list*)TreeIdx; - int i=0, idone=0; - bool done; - TreeBcast_v2* curTree; - - for (std::list::iterator itr = (*treeIdx).begin(); itr != (*treeIdx).end(); /*nothing*/){ - curTree = (TreeBcast_v2*) ArrTrees[*itr]; - assert(curTree!=nullptr); - done = curTree->Progress(); - if(done){ - FinishedTrees[idone] = *itr; /*store finished tree numbers */ - ++idone; - itr = (*treeIdx).erase(itr); - }else{ - ++itr; - } - ++i; - } - *Outcount = idone; - } - void BcTree_forwardMessageSimple(BcTree Tree, void* localBuffer){ TreeBcast_v2* BcastTree = (TreeBcast_v2*) Tree; @@ -119,128 +56,78 @@ namespace PEXSI{ StdList StdList_Init(){ - std::list* lst = new std::list(); + std::list* lst = new std::list(); return (StdList) lst; } - void StdList_Pushback(StdList lst, int dat){ - std::list* list = (std::list*) lst; + void StdList_Pushback(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; list->push_back(dat); } - - yes_no_t StdList_Find(StdList lst, int dat){ - std::list* list = (std::list*) lst; - for (std::list::iterator itr = (*list).begin(); itr != (*list).end(); /*nothing*/){ + + void StdList_Pushfront(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + list->push_front(dat); + } + + int_t StdList_Popfront(StdList lst){ + std::list* list = (std::list*) lst; + int_t dat = -1; + if((*list).begin()!=(*list).end()){ + dat = (*list).front(); + list->pop_front(); + } + return dat; + } + + yes_no_t StdList_Find(StdList lst, int_t dat){ + std::list* list = (std::list*) lst; + for (std::list::iterator 
itr = (*list).begin(); itr != (*list).end(); /*nothing*/){ if(*itr==dat)return YES; ++itr; } return NO; } - int StdList_Size(StdList lst){ - std::list* list = (std::list*) lst; + int_t StdList_Size(StdList lst){ + std::list* list = (std::list*) lst; return list->size(); } + yes_no_t StdList_Empty(StdList lst){ + std::list* list = (std::list*) lst; + return (*list).begin()==(*list).end()?YES:NO; + } + + RdTree RdTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed){ + assert(msgSize>0); TreeReduce_v2* ReduceTree = TreeReduce_v2::Create(comm,ranks,rank_cnt,msgSize,rseed); return (RdTree) ReduceTree; } - - void RdTree_SetTag(RdTree Tree, Int tag){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - ReduceTree->SetTag(tag); - } - int RdTree_GetTag(RdTree Tree){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - return ReduceTree->GetTag(); - } - - - int RdTree_GetDestCount(RdTree Tree){ + void RdTree_Destroy(RdTree Tree){ TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - return ReduceTree->GetDestCount(); + delete ReduceTree; } - yes_no_t RdTree_Progress(RdTree Tree){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - bool done = ReduceTree->Progress(); - // std::cout<* ReduceTree = (TreeReduce_v2*) Tree; - // ReduceTree->postRecv(); - // } - - - void RdTree_SetDataReady(RdTree Tree){ + void RdTree_SetTag(RdTree Tree, Int tag){ TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - ReduceTree->SetDataReady(true); + ReduceTree->SetTag(tag); } - - void RdTree_AllocRecvBuffers(RdTree Tree){ + int RdTree_GetDestCount(RdTree Tree){ TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - ReduceTree->AllocRecvBuffers(); + return ReduceTree->GetDestCount(); } - - void RdTree_SetLocalBuffer(RdTree Tree, void* localBuffer){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - ReduceTree->SetLocalBuffer( (double*) localBuffer); - } yes_no_t RdTree_IsRoot(RdTree Tree){ TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; return ReduceTree->IsRoot()?YES:NO; } - - yes_no_t RdTree_IsReady(RdTree Tree){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - return ReduceTree->IsReady()?YES:NO; - } - - - yes_no_t RdTree_StartForward(RdTree Tree){ - TreeReduce_v2* ReduceTree = (TreeReduce_v2*) Tree; - return ReduceTree->StartForward()?YES:NO; - } - - - - void RdTree_Testsome(StdList TreeIdx, RdTree* ArrTrees, int* Outcount, int* FinishedTrees){ - std::list* treeIdx = (std::list*)TreeIdx; - int i=0, idone=0; - bool done; - TreeReduce_v2* curTree; - - for (std::list::iterator itr = (*treeIdx).begin(); itr != (*treeIdx).end(); /*nothing*/){ - curTree = (TreeReduce_v2*) ArrTrees[*itr]; - assert(curTree!=nullptr); - done = curTree->Progress(); - - // if(*itr==9977){ - // std::cout<<"still good"<IsRoot()<* ReduceTree = (TreeReduce_v2*) Tree; @@ -268,5 +155,5 @@ namespace PEXSI{ -} //namespace PEXSI +} //namespace ASYNCOMM diff --git a/SRC/TreeReduce_v2.hpp b/SRC/TreeReduce_v2.hpp index 1125b7a0..817bc233 100644 --- a/SRC/TreeReduce_v2.hpp +++ b/SRC/TreeReduce_v2.hpp @@ -13,7 +13,7 @@ -namespace PEXSI{ +namespace ASYNCOMM{ @@ -40,12 +40,6 @@ namespace PEXSI{ virtual inline Int GetNumMsgToSend(){return this->myRank_==this->myRoot_?0:1;} virtual inline Int GetNumMsgToRecv(){return this->GetDestCount();} - - virtual void AllocRecvBuffers(); - - - - virtual void SetLocalBuffer(T * locBuffer); virtual T * GetLocalBuffer(); @@ -53,27 +47,6 @@ namespace PEXSI{ virtual void forwardMessageSimple(T * locBuffer); virtual void allocateRequest(); virtual void waitSendRequest(); - - //async wait 
and forward - virtual bool Progress(); - - - // void CopyLocalBuffer(T* destBuffer){ - // std::copy((char*)myData_,(char*)myData_+GetMsgSize(),(char*)destBuffer); - // } - - - - protected: - virtual void reduce( Int idxRecv, Int idReq); - virtual void forwardMessage(); - virtual void postRecv(); - virtual bool IsDataReceived(); - virtual bool isMessageForwarded(); - - - - }; @@ -84,9 +57,6 @@ class FTreeReduce_v2: public TreeReduce_v2{ public: FTreeReduce_v2(const MPI_Comm & pComm, Int * ranks, Int rank_cnt, Int msgSize); virtual FTreeReduce_v2 * clone() const; - virtual void postRecv(); - virtual void AllocRecvBuffers(); - virtual bool Progress(); }; @@ -138,26 +108,7 @@ class PalmTreeReduce_v2: public TreeReduce_v2{ }; - - - - template< typename T> - void TreeReduce_Waitsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags); - - template< typename T> - void TreeReduce_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags); - - template< typename T> - void TreeReduce_Waitall(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees); - - - template< typename T> - void TreeReduce_ProgressAll(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees); - - - - -}//namespace PEXSI +}//namespace ASYNCOMM #include "TreeReduce_v2_impl.hpp" #endif diff --git a/SRC/TreeReduce_v2_impl.hpp b/SRC/TreeReduce_v2_impl.hpp index 502d767f..b6dd4f13 100644 --- a/SRC/TreeReduce_v2_impl.hpp +++ b/SRC/TreeReduce_v2_impl.hpp @@ -6,7 +6,7 @@ // #include "TreeReduce_v2.hpp" -namespace PEXSI{ +namespace ASYNCOMM{ template TreeReduce_v2::TreeReduce_v2(const MPI_Comm & pComm, Int * ranks, Int rank_cnt, Int msgSize):TreeBcast_v2(pComm,ranks,rank_cnt,msgSize){ @@ -44,121 +44,6 @@ namespace PEXSI{ this->cleanupBuffers(); } - - - - template - inline void TreeReduce_v2::postRecv(){ - if(this->GetDestCount()>this->recvPostedCount_){ - for( Int idxRecv = 0; idxRecv < this->myDests_.size(); ++idxRecv ){ - Int iProc = this->myDests_[idxRecv]; - int error_code = MPI_Irecv( (char*)this->recvDataPtrs_[idxRecv], this->msgSize_, this->type_, - iProc, this->tag_,this->comm_, &this->recvRequests_[idxRecv] ); -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<recvPostedCount_++; - } // for (iProc) - } - } - - - - - template - inline void TreeReduce_v2::reduce( Int idxRecv, Int idReq){ - //add thing to my data -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"Contribution received:"<msgSize_;i++){ - statusOFS<recvDataPtrs_[idxRecv][i]<<" "; - if(i%10==0){statusOFS<= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"Reduced before:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS<msgSize_, ONE(), this->recvDataPtrs_[idxRecv], 1, this->sendDataPtrs_[0], 1 ); -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"Reduced after:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS< - inline void TreeReduce_v2::forwardMessage(){ - if(this->isReady_){ - if(this->myRank_!=this->myRoot_){ - //forward to my root if I have reseived everything - Int iProc = this->myRoot_; - // Use Isend to send to multiple targets - 
if(this->sendDataPtrs_.size()<1){ - this->sendDataPtrs_.assign(1,NULL); - } - - int msgsz = this->sendDataPtrs_[0]==NULL?0:this->msgSize_; - - int error_code = MPI_Isend((char*)this->sendDataPtrs_[0], msgsz, this->type_, - iProc, this->tag_,this->comm_, &this->sendRequests_[0] ); -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<sendPostedCount_++; -#ifdef COMM_PROFILE - PROFILE_COMM(this->myGRank_,this->myGRoot_,this->tag_,msgsz); -#endif - -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - statusOFS<myRank_<<" FWD to "<tag_<fwded_ = true; - } - } - - template< typename T> inline void TreeReduce_v2::forwardMessageSimple(T * locBuffer){ @@ -197,292 +82,14 @@ namespace PEXSI{ MPI_Wait(&this->sendRequests_[0],&status) ; } } - - - - template< typename T> - inline bool TreeReduce_v2::IsDataReceived(){ - bool retVal = false; - if(this->isReady_){ - if(this->recvCount_== this->GetDestCount()){ -// if(this->tag_==12){gdb_lock();} -// if(this->tag_==9){gdb_lock();} - retVal = true; - } - else if(this->recvCount_recvPostedCount_){ -// if(this->tag_==12){gdb_lock();} -// if(this->tag_==9){gdb_lock();} - //mpi_test_some on recvRequests_ - int recvCount = -1; - int reqCnt = this->recvRequests_.size();//this->recvPostedCount_-this->recvCount_;//GetDestCount(); - // assert(reqCnt <= this->recvRequests_.size()); - - int error_code = MPI_Testsome(reqCnt,&this->recvRequests_[0],&recvCount,&this->recvDoneIdx_[0],&this->recvStatuses_[0]); - -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<recvStatuses_.size();i++){ - error_code = this->recvStatuses_[i].MPI_ERROR; - if(error_code != MPI_SUCCESS){ - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<recvDoneIdx_[i]; - if(idx!=MPI_UNDEFINED){ - Int size = 0; - MPI_Get_count(&this->recvStatuses_[i], MPI_BYTE, &size); - - -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - statusOFS<myRank_<<" RECVD from "<recvStatuses_[i].MPI_SOURCE<<" on tag "<tag_<0){ - //resize if needed - if(this->sendDataPtrs_.size()<1){ - this->sendDataPtrs_.assign(1,NULL); - } - - //If sendDataPtrs is 0, allocate to the size of what has been received - if(this->sendDataPtrs_[0]==NULL){ - this->sendTempBuffer_.resize(this->msgSize_); - this->sendDataPtrs_[0] = (T*)&this->sendTempBuffer_[0]; - Int nelem = this->msgSize_; - std::fill(this->sendDataPtrs_[0],this->sendDataPtrs_[0]+nelem,ZERO()); - } - - //This is where the handle would be called - reduce(idx,i); - - } - - this->recvCount_++; - } - } - - if(this->recvCount_== this->GetDestCount()){ - retVal = true; - } - else{ - retVal = false; - } - } - else if(this->recvPostedCount_GetDestCount()){ -// if(this->tag_==12){gdb_lock();} -// if(this->tag_==9){gdb_lock();} - this->postRecv(); - retVal = false; - } - }else{ - // changed by Yang Liu: if not ready_, no active message, but expecting incoming message, then post recv - if(this->recvCount_==this->recvPostedCount_){ - if(this->recvPostedCount_GetDestCount()){ - this->postRecv(); - retVal = false; - } - } - } - - return retVal; - } - - template< typename T> - inline bool TreeReduce_v2::Progress(){ 
- - - bool retVal = false, tmp; - - // if(this->tag_==15 && this->IsRoot()){ - // std::cout<GetDestCount()<<" "<recvPostedCount_<<"YESYES"<done_){ - retVal = true; - } - else{ - //Do we need this ? - AllocRecvBuffers(); - - if(this->isAllocated_){ - - tmp = this->IsDataReceived(); // changed by Yang Liu, test receiving even not isReady_ - - - if(this->myRank_==this->myRoot_ && this->isAllocated_){ - this->isReady_=true; - this->isBufferSet_=true; - } - - if(this->isReady_ && this->isBufferSet_){ - if(this->IsDataReceived()){ - - //free the unnecessary arrays - this->recvTempBuffer_.clear(); - this->recvRequests_.clear(); - this->recvStatuses_.clear(); - this->recvDoneIdx_.clear(); - - if(this->isMessageForwarded()){ - retVal = true; - } - } - } - } - } - - if(retVal){ - this->done_ = retVal; - //TODO do some smart cleanup here - } - return retVal; - } template< typename T> inline T * TreeReduce_v2::GetLocalBuffer(){ return this->sendDataPtrs_[0]; } - template< typename T> - inline void TreeReduce_v2::SetLocalBuffer(T * locBuffer){ - if(this->sendDataPtrs_.size()<1){ - this->sendDataPtrs_.assign(1,NULL); - } - - - if(!this->IsRoot()){ - //if not root, we need to allocate a temp buffer anyway - if(this->sendDataPtrs_[0]==NULL){ - this->sendTempBuffer_.resize(this->msgSize_); - this->sendDataPtrs_[0] = (T*)&this->sendTempBuffer_[0]; - Int nelem = this->msgSize_; - std::fill(this->sendDataPtrs_[0],this->sendDataPtrs_[0]+nelem,ZERO()); - } - if(!this->isBufferSet_){ -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"Buffer before:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS<= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"External buffer:"<msgSize_;i++){ - statusOFS<msgSize_, ONE(), locBuffer, 1, this->sendDataPtrs_[0], 1 ); -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"Buffer after:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS<isBufferSet_= true; - } - else{ - - if(this->sendDataPtrs_[0]!=NULL && this->sendDataPtrs_[0]!=locBuffer){ -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"ROOT Buffer before:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS<= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"ROOT External buffer:"<msgSize_;i++){ - statusOFS<msgSize_, ONE(), this->sendDataPtrs_[0], 1, locBuffer, 1 ); - this->sendTempBuffer_.clear(); - this->sendDataPtrs_[0] = locBuffer; -#if ( _DEBUGlevel_ >= 1 ) || defined(REDUCE_VERBOSE) - { - statusOFS<<"[tag="<tag_<<"] "<<"ROOT Buffer after:"<msgSize_;i++){ - statusOFS<sendDataPtrs_[0][i]<<" "; - if(i%10==0){statusOFS< - inline void TreeReduce_v2::AllocRecvBuffers(){ - if(!this->isAllocated_){ - this->recvDataPtrs_.assign(this->GetDestCount(),NULL); - this->recvTempBuffer_.resize(this->GetDestCount()*this->msgSize_); - - for( Int idxRecv = 0; idxRecv < this->GetDestCount(); ++idxRecv ){ - this->recvDataPtrs_[idxRecv] = (T*)&(this->recvTempBuffer_[idxRecv*this->msgSize_]); - } - - this->recvRequests_.assign(this->GetDestCount(),MPI_REQUEST_NULL); - this->recvStatuses_.resize(this->GetDestCount()); - this->recvDoneIdx_.resize(this->GetDestCount()); - - this->sendRequests_.assign(1,MPI_REQUEST_NULL); - - this->isAllocated_ = true; - } - } template< typename T> inline void TreeReduce_v2::Reset(){ @@ -491,70 +98,6 @@ namespace PEXSI{ this->isBufferSet_=false; } - template< typename T> - inline 
bool TreeReduce_v2::isMessageForwarded(){ - bool retVal=false; -// if(this->tag_==12){gdb_lock();} -// if(this->tag_==9){gdb_lock();} - - if(!this->fwded_){ - //If data has been received but not forwarded - if(this->IsDataReceived()){ - this->forwardMessage(); - } - retVal = false; - } - else{ - //If data has been forwared, check for completion of send requests - int destCount = this->myRank_==this->myRoot_?0:1; - int completed = 0; - if(destCount>0){ - //test the send requests - int flag = 0; - - this->sendDoneIdx_.resize(destCount); - -// if(this->tag_==12){gdb_lock();} - -#ifndef CHECK_MPI_ERROR - MPI_Testsome(destCount,this->sendRequests_.data(),&completed,this->sendDoneIdx_.data(),MPI_STATUSES_IGNORE); -#else - this->sendStatuses_.resize(destCount); - int error_code = MPI_Testsome(destCount,this->sendRequests_.data(),&completed,this->sendDoneIdx_.data(),this->sendStatuses_.data()); - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<sendStatuses_.size();i++){ - error_code = this->sendStatuses_[i].MPI_ERROR; - if(error_code != MPI_SUCCESS){ - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<sendCount_ += completed; - retVal = this->sendCount_ == this->sendPostedCount_; - - //MPI_Testall(destCount,sendRequests_.data(),&flag,MPI_STATUSES_IGNORE); - //retVal = flag==1; - } - return retVal; - } - template< typename T> inline TreeReduce_v2 * TreeReduce_v2::Create(const MPI_Comm & pComm, Int * ranks, Int rank_cnt, Int msgSize, double rseed){ @@ -621,102 +164,6 @@ namespace PEXSI{ #endif } - template< typename T> - inline void FTreeReduce_v2::postRecv() - { - if(this->isAllocated_ && this->GetDestCount()>this->recvPostedCount_){ - int error_code = MPI_Irecv( (char*)this->recvDataPtrs_[0], this->msgSize_, this->type_, - MPI_ANY_SOURCE, this->tag_,this->comm_, &this->recvRequests_[0] ); -#ifdef CHECK_MPI_ERROR - if(error_code!=MPI_SUCCESS){ - char error_string[BUFSIZ]; - int length_of_error_string, error_class; - - MPI_Error_class(error_code, &error_class); - MPI_Error_string(error_class, error_string, &length_of_error_string); - statusOFS<recvPostedCount_++; - } - } - - template< typename T> - inline void FTreeReduce_v2::AllocRecvBuffers(){ - if(!this->isAllocated_){ - this->recvDataPtrs_.assign(1,NULL); - this->recvTempBuffer_.resize(this->msgSize_); - - this->recvDataPtrs_[0] = (T*)&(this->recvTempBuffer_[0]); - - this->recvRequests_.assign(1,MPI_REQUEST_NULL); - this->recvStatuses_.resize(1); - this->recvDoneIdx_.resize(1); - this->sendRequests_.assign(1,MPI_REQUEST_NULL); - - this->isAllocated_ = true; - } - } - - - template< typename T> - inline bool FTreeReduce_v2::Progress(){ - - - bool retVal = false,tmp; - if(this->done_){ - retVal = true; - } - else{ - - this->AllocRecvBuffers(); - - if(this->isAllocated_){ - - tmp = this->IsDataReceived(); // changed by Yang Liu, test receiving even not isReady_ - - if(this->myRank_==this->myRoot_ && this->isAllocated_){ - this->isBufferSet_=true; - this->isReady_=true; - } - - if(this->isReady_ && this->isBufferSet_){ - if(this->IsDataReceived()){ - - - //free the unnecessary arrays - this->recvTempBuffer_.clear(); - this->recvRequests_.clear(); - this->recvStatuses_.clear(); - this->recvDoneIdx_.clear(); - - if(this->isMessageForwarded()){ - retVal = true; - } - } - //else 
if(this->recvPostedCount_GetDestCount()){ - // //TODO check this - // if(this->recvPostedCount_==this->recvCount_){ - // this->postRecv(); - // } - //} - } - } - } - - if(retVal){ - this->done_ = retVal; - //TODO do some smart cleanup - } - return retVal; - } - - template< typename T> BTreeReduce_v2::BTreeReduce_v2(const MPI_Comm & pComm, Int * ranks, Int rank_cnt, Int msgSize):TreeReduce_v2(pComm, ranks, rank_cnt, msgSize){ buildTree(ranks,rank_cnt); @@ -978,98 +425,7 @@ namespace PEXSI{ - template< typename T> - void TreeReduce_Waitsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags){ - doneIdx.clear(); - auto all_done = [](const std::vector & boolvec){ - return std::all_of(boolvec.begin(), boolvec.end(), [](bool v) { return v; }); - }; - - while(doneIdx.empty() && !all_done(finishedFlags) ){ - for(int i = 0; iProgress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ - finishedFlags[i] = true; - } - } - } - } - - template< typename T> - void TreeReduce_Testsome(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees, std::list & doneIdx, std::vector & finishedFlags){ - doneIdx.clear(); - for(int i = 0; iProgress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ - finishedFlags[i] = true; - } - } - } - - - template< typename T> - void TreeReduce_Waitall(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees){ - std::list doneIdx; - std::vector finishedFlags(treeIdx.size(),false); - - doneIdx.clear(); - auto all_done = [](const std::vector & boolvec){ - return std::all_of(boolvec.begin(), boolvec.end(), [](bool v) { return v; }); - }; - - while(!all_done(finishedFlags) ){ - for(int i = 0; iProgress(); - if(done){ - if(!finishedFlags[i]){ - doneIdx.push_back(i); - finishedFlags[i] = true; - } - } - } - else{ - finishedFlags[i] = true; - } - } - } - } - - template< typename T> - void TreeReduce_ProgressAll(std::vector & treeIdx, std::vector< std::shared_ptr > > & arrTrees){ - - for(int i = 0; iProgress(); - } - } - } - -} //namespace PEXSI +} //namespace ASYNCOMM #endif diff --git a/SRC/dreadrb.c b/SRC/dreadrb.c index 98e777ec..d62fb7bb 100644 --- a/SRC/dreadrb.c +++ b/SRC/dreadrb.c @@ -141,11 +141,7 @@ static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t pers i = 0; while (i < n) { fgets(buf, 100, fp); /* read a line at a time */ - // if(0==i%10000){ - // printf("line ind: %s\n", buf); - // fflush(stdout); - // } - for (j=0; j= 1) - if ( !iam ) { - printf("Matrix type %s\n", type); - fflush(stdout); - } + if ( !iam ) printf("Matrix type %s\n", type); #endif fscanf(fp, "%14c", buf); *nrow = atoi(buf); diff --git a/SRC/environment.hpp b/SRC/environment.hpp index 1ccc2328..fd58724e 100644 --- a/SRC/environment.hpp +++ b/SRC/environment.hpp @@ -1,45 +1,3 @@ -/* - Copyright (c) 2012 The Regents of the University of California, - through Lawrence Berkeley National Laboratory. - - Author: Lin Lin - - This file is part of PEXSI. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. 
- (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - (3) Neither the name of the University of California, Lawrence Berkeley - National Laboratory, U.S. Dept. of Energy nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You are under no obligation whatsoever to provide any bug fixes, patches, or - upgrades to the features, functionality or performance of the source code - ("Enhancements") to anyone; however, if you choose to make your Enhancements - available either publicly, or directly to Lawrence Berkeley National - Laboratory, without imposing a separate written license agreement for such - Enhancements, then you hereby grant the following license: a non-exclusive, - royalty-free perpetual license to install, use, modify, prepare derivative - works, incorporate into other computer software, distribute, and sublicense - such enhancements or derivative works thereof, in binary and source code form. -*/ /// @file environment.hpp /// @brief Environmental variables. /// @date 2012-08-10 @@ -103,10 +61,10 @@ * Data types and constants **********************************************************************/ -/// @namespace PEXSI +/// @namespace ASYNCOMM /// @brief The main namespace. 
-namespace PEXSI{ +namespace ASYNCOMM{ // Basic data types @@ -131,34 +89,6 @@ typedef double Scalar; // IO extern std::ofstream statusOFS; -#ifdef GEMM_PROFILE -extern std::ofstream statOFS; -#include -extern std::deque gemm_stat; -#endif - -#if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST) -extern std::ofstream commOFS; -#include -extern std::deque comm_stat; - -#define PROFILE_COMM(sender,receiver,tag,size)\ -do{\ - comm_stat.push_back(sender);\ - comm_stat.push_back(receiver);\ - comm_stat.push_back(tag);\ - comm_stat.push_back(size);\ -}while(0) - -#define HEADER_COMM "Sender\tReceiver\tTag\tSize" -#define LINE_COMM(it) *it<<"\t"<<*(it+1)<<"\t"<<*(it+2)<<"\t"<<*(it+3) - -#else - -#define PROFILE_COMM(sender,receiver,tag,size) - -#endif - // ********************************************************************* // Define constants @@ -194,13 +124,13 @@ const char LOWER = 'L'; const Real au2K = 315774.67; const Real PI = 3.141592653589793; -} // namespace PEXSI +} // namespace ASYNCOMM /*********************************************************************** * Error handling **********************************************************************/ -namespace PEXSI{ +namespace ASYNCOMM{ @@ -291,7 +221,7 @@ namespace PEXSI{ } -} // namespace PEXSI +} // namespace ASYNCOMM #endif // _PEXSI_ENVIRONMENT_HPP_ diff --git a/SRC/global.cpp b/SRC/global.cpp index c435771f..6d2f4515 100644 --- a/SRC/global.cpp +++ b/SRC/global.cpp @@ -1,65 +1,13 @@ -/* - Copyright (c) 2012 The Regents of the University of California, - through Lawrence Berkeley National Laboratory. - - Author: Lin Lin - - This file is part of PEXSI. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - (3) Neither the name of the University of California, Lawrence Berkeley - National Laboratory, U.S. Dept. of Energy nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - You are under no obligation whatsoever to provide any bug fixes, patches, or - upgrades to the features, functionality or performance of the source code - ("Enhancements") to anyone; however, if you choose to make your Enhancements - available either publicly, or directly to Lawrence Berkeley National - Laboratory, without imposing a separate written license agreement for such - Enhancements, then you hereby grant the following license: a non-exclusive, - royalty-free perpetual license to install, use, modify, prepare derivative - works, incorporate into other computer software, distribute, and sublicense - such enhancements or derivative works thereof, in binary and source code form. -*/ #include "environment.hpp" #include -namespace PEXSI{ +namespace ASYNCOMM{ // ********************************************************************* // IO // ********************************************************************* std::ofstream statusOFS; -#ifdef GEMM_PROFILE - std::ofstream statOFS; - std::deque gemm_stat; -#endif - -#if defined(COMM_PROFILE) || defined(COMM_PROFILE_BCAST) - std::ofstream commOFS; - std::deque comm_stat; -#endif - // ********************************************************************* // Error handling @@ -87,4 +35,4 @@ namespace PEXSI{ } #endif // ifndef _RELEASE_ -} // namespace PEXSI +} // namespace ASYNCOMM diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c index a5a01a13..c9f2c859 100644 --- a/SRC/pddistribute.c +++ b/SRC/pddistribute.c @@ -1,13 +1,13 @@ /*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) + Copyright (c) 2003, The Regents of the University of California, through + Lawrence Berkeley National Laboratory (subject to receipt of any required + approvals from U.S. Dept. of Energy) -All rights reserved. + All rights reserved. -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ + The source code is distributed under BSD license, see the file License.txt + at the top-level directory. + */ /*! @file @@ -21,6 +21,11 @@ at the top-level directory. #include "superlu_ddefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + + /*! \brief * *
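The first pddistribute.c hunk above adds a CACHELINE constant of 64 bytes (the cache-line size of the KNL/Haswell nodes named in its comment). Its use sites are not visible in this excerpt; a constant like this is typically used to pad per-thread workspace so that OpenMP threads do not false-share cache lines. A minimal sketch of that idea, with a hypothetical helper name that is not part of the patch:

    #include <stddef.h>

    #ifndef CACHELINE
    #define CACHELINE 64   /* bytes; same default as the hunk above */
    #endif

    /* Hypothetical helper (not from the patch): round an element count up to a
     * whole number of cache lines, so per-thread sections of a shared array do
     * not false-share, assuming the base allocation is line-aligned. */
    static size_t pad_to_cacheline(size_t nelem, size_t elem_size)
    {
        size_t per_line = (CACHELINE + elem_size - 1) / elem_size;  /* elements per line */
        return ((nelem + per_line - 1) / per_line) * per_line;      /* round nelem up */
    }

    /* e.g. stride = pad_to_cacheline(m_loc, sizeof(double)) doubles per thread */
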
@@ -56,1455 +61,1926 @@ at the top-level directory.
  * ============
  * 
*/ -int_t + int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, - gridinfo_t *grid, int_t *colptr[], int_t *rowind[], - double *a[]) + Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, + gridinfo_t *grid, int_t *colptr[], int_t *rowind[], + double *a[]) { - NRformat_loc *Astore; - int_t *perm_r; /* row permutation vector */ - int_t *perm_c; /* column permutation vector */ - int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; - int_t nnz_loc; /* number of local nonzeros */ - int_t SendCnt; /* number of remote nonzeros to be sent */ - int_t RecvCnt; /* number of remote nonzeros to be sent */ - int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; - int_t *ia, *ja, **ia_send, *index, *itemp; - int_t *ptr_to_send; - double *aij, **aij_send, *nzval, *dtemp; - double *nzval_a; - int iam, it, p, procs; - MPI_Request *send_req; - MPI_Status status; - - - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - iam = grid->iam; + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; + int_t nnz_loc; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ptr_to_send; + double *aij, **aij_send, *nzval, *dtemp; + double *nzval_a; + int iam, it, p, procs; + MPI_Request *send_req; + MPI_Status status; + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dReDistribute_A()"); + CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - Astore = (NRformat_loc *) A->Store; - n = A->ncol; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - nnzToRecv = intCalloc_dist(2*procs); - nnzToSend = nnzToRecv + procs; - - - /* ------------------------------------------------------------ - COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, - THEN ALLOCATE SPACE. - THIS ACCOUNTS FOR THE FIRST PASS OF A. - ------------------------------------------------------------*/ - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - ++nnzToSend[p]; + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nnzToRecv = intCalloc_dist(2*procs); + nnzToSend = nnzToRecv + procs; + + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. 
+ ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } } - } - /* All-to-all communication */ - MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, - grid->comm); + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); - maxnnzToRecv = 0; - nnz_loc = SendCnt = RecvCnt = 0; + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - SendCnt += nnzToSend[p]; - RecvCnt += nnzToRecv[p]; - maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); - } else { - nnz_loc += nnzToRecv[p]; - /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + + /* Allocate space for storing the triplets after redistribution. */ + if ( k ) { /* count can be zero. */ + if ( !(ia = intMalloc_dist(2*k)) ) + ABORT("Malloc fails for ia[]."); + if ( !(aij = doubleMalloc_dist(k)) ) + ABORT("Malloc fails for aij[]."); } - } - k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ - - /* Allocate space for storing the triplets after redistribution. */ - if ( k ) { /* count can be zero. */ - if ( !(ia = intMalloc_dist(2*k)) ) - ABORT("Malloc fails for ia[]."); - if ( !(aij = doubleMalloc_dist(k)) ) - ABORT("Malloc fails for aij[]."); - } - ja = ia + k; - - /* Allocate temporary storage for sending/receiving the A triplets. */ - if ( procs > 1 ) { - if ( !(send_req = (MPI_Request *) - SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); - if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) - ABORT("Malloc fails for ia_send[]."); - if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) - ABORT("Malloc fails for aij_send[]."); - if ( SendCnt ) { /* count can be zero */ - if ( !(index = intMalloc_dist(2*SendCnt)) ) - ABORT("Malloc fails for index[]."); - if ( !(nzval = doubleMalloc_dist(SendCnt)) ) - ABORT("Malloc fails for nzval[]."); - } - if ( !(ptr_to_send = intCalloc_dist(procs)) ) - ABORT("Malloc fails for ptr_to_send[]."); - if ( maxnnzToRecv ) { /* count can be zero */ - if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) - ABORT("Malloc fails for itemp[]."); - if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) - ABORT("Malloc fails for dtemp[]."); - } - - for (i = 0, j = 0, p = 0; p < procs; ++p) { - if ( p != iam ) { - ia_send[p] = &index[i]; - i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ - aij_send[p] = &nzval[j]; - j += nnzToSend[p]; - } - } - } /* if procs > 1 */ - - if ( !(*colptr = intCalloc_dist(n+1)) ) - ABORT("Malloc fails for *colptr[]."); - - /* ------------------------------------------------------------ - LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. - THIS ACCOUNTS FOR THE SECOND PASS OF A. - ------------------------------------------------------------*/ - nnz_loc = 0; /* Reset the local nonzero count. 
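
Aside: the first pass above only counts. For each local nonzero (i,j) it applies the row and column permutations, maps the (row block, column block) pair to its owning process on the 2D grid, and bumps nnzToSend[owner]; a single MPI_Alltoall then tells every rank how much it will receive from every other rank. A stripped-down sketch of that count-then-exchange step follows, using plain int instead of int_t; block_owner() assumes the usual block-cyclic mapping behind the PNUM/PROW/PCOL macros and is not copied from the library.

    #include <mpi.h>

    /* Owner of block (gbi, gbj) on an nprow x npcol process grid, assuming
       the standard block-cyclic mapping. */
    static int block_owner(int gbi, int gbj, int nprow, int npcol)
    {
        return (gbi % nprow) * npcol + (gbj % npcol);
    }

    /* First pass: count the nonzeros destined for every process, then swap
       the counts so each rank also knows how much it will receive. */
    void count_and_exchange(int nnz_local, const int *row_blk, const int *col_blk,
                            int nprow, int npcol, MPI_Comm comm,
                            int *nnzToSend, int *nnzToRecv)
    {
        int procs = nprow * npcol;
        for (int p = 0; p < procs; ++p) nnzToSend[p] = 0;
        for (int k = 0; k < nnz_local; ++k)
            ++nnzToSend[block_owner(row_blk[k], col_blk[k], nprow, npcol)];

        MPI_Alltoall(nnzToSend, 1, MPI_INT, nnzToRecv, 1, MPI_INT, comm);
    }
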
*/ - nzval_a = Astore->nzval; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ - jcol = Astore->colind[j]; - gbi = BlockNum( irow ); - gbj = BlockNum( jcol ); - p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); - - if ( p != iam ) { /* remote */ - k = ptr_to_send[p]; - ia_send[p][k] = irow; - ia_send[p][k + nnzToSend[p]] = jcol; - aij_send[p][k] = nzval_a[j]; - ++ptr_to_send[p]; - } else { /* local */ - ia[nnz_loc] = irow; - ja[nnz_loc] = jcol; - aij[nnz_loc] = nzval_a[j]; - ++nnz_loc; - ++(*colptr)[jcol]; /* Count nonzeros in each column */ - } + ja = ia + k; + + /* Allocate temporary storage for sending/receiving the A triplets. */ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) + ABORT("Malloc fails for ia_send[]."); + if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) + ABORT("Malloc fails for aij_send[]."); + if ( SendCnt ) { /* count can be zero */ + if ( !(index = intMalloc_dist(2*SendCnt)) ) + ABORT("Malloc fails for index[]."); + if ( !(nzval = doubleMalloc_dist(SendCnt)) ) + ABORT("Malloc fails for nzval[]."); + } + if ( !(ptr_to_send = intCalloc_dist(procs)) ) + ABORT("Malloc fails for ptr_to_send[]."); + if ( maxnnzToRecv ) { /* count can be zero */ + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) + ABORT("Malloc fails for itemp[]."); + if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) + ABORT("Malloc fails for dtemp[]."); + } + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + aij_send[p] = &nzval[j]; + j += nnzToSend[p]; + } + } + } /* if procs > 1 */ + + if ( !(*colptr = intCalloc_dist(n+1)) ) + ABORT("Malloc fails for *colptr[]."); + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. */ + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + ++(*colptr)[jcol]; /* Count nonzeros in each column */ + } + } } - } - - /* ------------------------------------------------------------ - PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. - NOTE: Can possibly use MPI_Alltoallv. - ------------------------------------------------------------*/ - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToSend[p]; - MPI_Isend( ia_send[p], it, mpi_int_t, - p, iam, grid->comm, &send_req[p] ); - it = nnzToSend[p]; - MPI_Isend( aij_send[p], it, MPI_DOUBLE, - p, iam+procs, grid->comm, &send_req[procs+p] ); + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. 
THIS INVOLVES ALL-TO-ALL COMMUNICATION. +NOTE: Can possibly use MPI_Alltoallv. +------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_DOUBLE, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } } - } - - for (p = 0; p < procs; ++p) { - if ( p != iam ) { - it = 2*nnzToRecv[p]; - MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); - it = nnzToRecv[p]; - MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, - grid->comm, &status ); - for (i = 0; i < nnzToRecv[p]; ++i) { - ia[nnz_loc] = itemp[i]; - jcol = itemp[i + nnzToRecv[p]]; - /*assert(jcolcomm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /*assert(jcol 1 ) { + SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + if ( SendCnt ) { + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + } + SUPERLU_FREE(ptr_to_send); + if ( maxnnzToRecv ) { + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + } + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. + ------------------------------------------------------------*/ + if ( nnz_loc ) { /* nnz_loc can be zero */ + if ( !(*rowind = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *rowind[]."); + if ( !(*a = doubleMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for *a[]."); + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = (*colptr)[0]; + (*colptr)[0] = 0; + for (j = 1; j < n; ++j) { + k += jsize; + jsize = (*colptr)[j]; + (*colptr)[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + j = ja[i]; + k = (*colptr)[j]; + (*rowind)[k] = ia[i]; + (*a)[k] = aij[i]; + ++(*colptr)[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; + (*colptr)[0] = 0; + + if ( nnz_loc ) { + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); } - } - - /* ------------------------------------------------------------ - DEALLOCATE TEMPORARY STORAGE - ------------------------------------------------------------*/ - - SUPERLU_FREE(nnzToRecv); - - if ( procs > 1 ) { - SUPERLU_FREE(send_req); - SUPERLU_FREE(ia_send); - SUPERLU_FREE(aij_send); - if ( SendCnt ) { - SUPERLU_FREE(index); - SUPERLU_FREE(nzval); - } - SUPERLU_FREE(ptr_to_send); - if ( maxnnzToRecv ) { - SUPERLU_FREE(itemp); - SUPERLU_FREE(dtemp); - } - } - - /* ------------------------------------------------------------ - CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. 
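
Aside: the exchange above posts one MPI_Isend per destination and matches it with blocking MPI_Recv calls; the in-code NOTE points out that MPI_Alltoallv could do the same job. A sketch of that alternative for the value part of the triplets, with displacements built from the counts exchanged earlier; everything here is illustrative and not the routine's actual code.

    #include <mpi.h>
    #include <stdlib.h>

    /* Exchange the numerical values in one collective instead of a loop of
       MPI_Isend / MPI_Recv pairs.  sendbuf must already be packed so that
       the entries bound for rank p are contiguous and in rank order; the
       entries a rank "sends" to itself are simply copied locally by MPI. */
    void exchange_values_alltoallv(double *sendbuf, int *nnzToSend,
                                   double *recvbuf, int *nnzToRecv,
                                   int procs, MPI_Comm comm)
    {
        int *sdispls = (int *) malloc(procs * sizeof(int));
        int *rdispls = (int *) malloc(procs * sizeof(int));
        sdispls[0] = rdispls[0] = 0;
        for (int p = 1; p < procs; ++p) {
            sdispls[p] = sdispls[p-1] + nnzToSend[p-1];
            rdispls[p] = rdispls[p-1] + nnzToRecv[p-1];
        }
        MPI_Alltoallv(sendbuf, nnzToSend, sdispls, MPI_DOUBLE,
                      recvbuf, nnzToRecv, rdispls, MPI_DOUBLE, comm);
        free(sdispls);
        free(rdispls);
    }

Whether the collective is a win depends on the MPI implementation; the hand-rolled loop above lets each received chunk be unpacked as soon as it arrives, which a single MPI_Alltoallv does not.
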
- ------------------------------------------------------------*/ - if ( nnz_loc ) { /* nnz_loc can be zero */ - if ( !(*rowind = intMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *rowind[]."); - if ( !(*a = doubleMalloc_dist(nnz_loc)) ) - ABORT("Malloc fails for *a[]."); - } - - /* Initialize the array of column pointers */ - k = 0; - jsize = (*colptr)[0]; - (*colptr)[0] = 0; - for (j = 1; j < n; ++j) { - k += jsize; - jsize = (*colptr)[j]; - (*colptr)[j] = k; - } - - /* Copy the triplets into the column oriented storage */ - for (i = 0; i < nnz_loc; ++i) { - j = ja[i]; - k = (*colptr)[j]; - (*rowind)[k] = ia[i]; - (*a)[k] = aij[i]; - ++(*colptr)[j]; - } - - /* Reset the column pointers to the beginning of each column */ - for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; - (*colptr)[0] = 0; - - if ( nnz_loc ) { - SUPERLU_FREE(ia); - SUPERLU_FREE(aij); - } #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit dReDistribute_A()"); + CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif - - return 0; + + return 0; } /* dReDistribute_A */ -float + float pddistribute(fact_t fact, int_t n, SuperMatrix *A, - ScalePermstruct_t *ScalePermstruct, - Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, - gridinfo_t *grid, int_t nrhs) -/* - * -- Distributed SuperLU routine (version 2.0) -- - * Lawrence Berkeley National Lab, Univ. of California Berkeley. - * March 15, 2003 - * - * - * Purpose - * ======= - * Distribute the matrix onto the 2D process mesh. - * - * Arguments - * ========= - * - * fact (input) fact_t - * Specifies whether or not the L and U structures will be re-used. - * = SamePattern_SameRowPerm: L and U structures are input, and - * unchanged on exit. - * = DOFACT or SamePattern: L and U structures are computed and output. - * - * n (input) int - * Dimension of the matrix. - * - * A (input) SuperMatrix* - * The distributed input matrix A of dimension (A->nrow, A->ncol). - * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: - * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. - * - * ScalePermstruct (input) ScalePermstruct_t* - * The data structure to store the scaling and permutation vectors - * describing the transformations performed to the original matrix A. - * - * Glu_freeable (input) *Glu_freeable_t - * The global structure describing the graph of L and U. - * - * LUstruct (input) LUstruct_t* - * Data structures for L and U factors. - * - * grid (input) gridinfo_t* - * The 2D process mesh. - * - * Return value - * ============ - * > 0, working storage required (in bytes). - * - */ + ScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, + gridinfo_t *grid, int_t nrhs) + /* + * -- Distributed SuperLU routine (version 2.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * March 15, 2003 + * + * + * Purpose + * ======= + * Distribute the matrix onto the 2D process mesh. + * + * Arguments + * ========= + * + * fact (input) fact_t + * Specifies whether or not the L and U structures will be re-used. + * = SamePattern_SameRowPerm: L and U structures are input, and + * unchanged on exit. + * = DOFACT or SamePattern: L and U structures are computed and output. + * + * n (input) int + * Dimension of the matrix. + * + * A (input) SuperMatrix* + * The distributed input matrix A of dimension (A->nrow, A->ncol). + * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: + * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. 
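
Aside: the tail of dReDistribute_A above turns the gathered (ia, ja, aij) triplets into compressed-column storage with a counting-sort pattern: per-column counts become starting offsets, each triplet is scattered while its column pointer advances, and the pointers are finally shifted back by one slot. A compact stand-alone version of the same idea, with plain int indices instead of int_t.

    /* Convert nnz triplets (ia[k], ja[k], aij[k]) of an n-column matrix into
       CCS arrays rowind/a/colptr.  colptr has n+1 entries and already holds
       the per-column counts in colptr[0..n-1]; on return, column j occupies
       positions colptr[j] .. colptr[j+1]-1 of rowind[] and a[]. */
    void triplets_to_ccs(int nnz, int n,
                         const int *ia, const int *ja, const double *aij,
                         int *rowind, double *a, int *colptr)
    {
        /* Turn counts into starting positions (exclusive prefix sum). */
        int k = 0, jsize = colptr[0];
        colptr[0] = 0;
        for (int j = 1; j < n; ++j) {
            k += jsize;
            jsize = colptr[j];
            colptr[j] = k;
        }

        /* Scatter each triplet, advancing its column pointer as we go. */
        for (int i = 0; i < nnz; ++i) {
            int j = ja[i];
            int p = colptr[j];
            rowind[p] = ia[i];
            a[p] = aij[i];
            ++colptr[j];
        }

        /* Pointers now mark column ends; shift back to recover the starts. */
        for (int j = n; j > 0; --j) colptr[j] = colptr[j-1];
        colptr[0] = 0;
    }
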
+ * + * ScalePermstruct (input) ScalePermstruct_t* + * The data structure to store the scaling and permutation vectors + * describing the transformations performed to the original matrix A. + * + * Glu_freeable (input) *Glu_freeable_t + * The global structure describing the graph of L and U. + * + * LUstruct (input) LUstruct_t* + * Data structures for L and U factors. + * + * grid (input) gridinfo_t* + * The 2D process mesh. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ { - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, - len, len1, nsupc; - int_t lib; /* local block row number */ - int_t nlb; /* local block rows*/ - int_t ljb; /* local block column number */ - int_t nrbl; /* number of L blocks in current block column */ - int_t nrbu; /* number of U blocks in current block column */ - int_t gb; /* global block number; 0 < gb <= nsuper */ - int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, kcol,krow, mycol, myrow, pc, pr; - int_t mybufmax[NBUFFERS]; - NRformat_loc *Astore; - double *a; - int_t *asub, *xa; - int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ - int_t *supno = Glu_persist->supno; - int_t *lsub, *xlsub, *usub, *xusub; - int_t nsupers; - int_t next_lind; /* next available position in index[*] */ - int_t next_lval; /* next available position in nzval[*] */ - int_t *index; /* indices consist of headers and row subscripts */ - int_t *index_srt; /* indices consist of headers and row subscripts */ - int *index1; /* temporary pointer to array of int */ - double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int iam, jbrow, kcol,krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + double *a; + int_t *asub, *xa; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ + double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ RdTree *LRtree_ptr; /* 
size ceil(NSUPERS/Pr) */ - int_t msgsize; + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - /*-- Counts to be used in factorization. --*/ - int *ToRecv, *ToSendD, **ToSendR; - - /*-- Counts to be used in lower triangular solve. --*/ - int_t *fmod; /* Modification count for L-solve. */ - int_t **fsendx_plist; /* Column process list to send down Xk. */ - int_t nfrecvx = 0; /* Number of Xk I will receive. */ - int_t nfsendx = 0; /* Number of Xk I will send */ - int_t kseen; - - /*-- Counts to be used in upper triangular solve. --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist; /* Column process list to send down Xk. */ - int_t nbrecvx = 0; /* Number of Xk I will receive. */ - int_t nbsendx = 0; /* Number of Xk I will send */ - int_t *ilsum; /* starting position of each supernode in - the full array (local) */ - - /*-- Auxiliary arrays; freed on return --*/ - int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ - int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ - int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ - int_t *Ucbs; /* number of column blocks in a block row */ - int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ - int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ - int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ - int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ - int_t *ActiveFlag; - int Iactive; - int_t *ranks; - int_t *idxs; - int_t **nzrows; - double rseed; - int_t rank_cnt,rank_cnt_ref,Root; + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; - float mem_use = 0.0; + double zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; int_t *mod_bit; - int_t *frecv, *lloc; + int_t *frecv, *brecv, *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double *SeedSTD_BC,*SeedSTD_RD; int_t idx_indx,idx_lusup; - int nbrow; - int_t ik, il, lk, rel, knsupc, idx_r; - int_t lptr1_tmp, idx_i, idx_v,m, uu; - + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; + int_t nub; + int tag; #if ( PRNTlevel>=1 ) - int_t nLblocks = 0, nUblocks = 0; + int_t nLblocks = 0, nUblocks = 0; #endif #if ( PROFlevel>=1 ) - double t, t_u, t_l; - int_t u_blks; + double t, t_u, t_l; + int_t u_blks; #endif - /* Initialization. */ - iam = grid->iam; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; - nsupers = supno[n-1] + 1; - Astore = (NRformat_loc *) A->Store; + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; -#if ( PRNTlevel>=1 ) - iword = sizeof(int_t); - dword = sizeof(double); -#endif + // #if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(double); + + aln_i = ceil(CACHELINE/(double)iword); + + // #endif #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pddistribute()"); + CHECK_MALLOC(iam, "Enter pddistribute()"); #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, - grid, &xa, &asub, &a); + dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf("--------\n" - ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); #endif - if ( fact == SamePattern_SameRowPerm ) { + if ( fact == SamePattern_SameRowPerm ) { #if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; + t_l = t_u = 0; u_blks = 0; #endif - /* We can propagate the new values of A into the existing - L and U data structures. 
*/ - ilsum = Llu->ilsum; - ldaspa = Llu->ldalsum; - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ - if ( !(Urb_length = intCalloc_dist(nrbu)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) - ABORT("Malloc fails for Urb_indptr[]."); - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; #if ( PRNTlevel>=1 ) - mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; #endif #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* Initialize Uval to zero. */ - for (lb = 0; lb < nrbu; ++lb) { - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - index = Ufstnz_br_ptr[lb]; - if ( index ) { - uval = Unzval_br_ptr[lb]; - len = index[1]; - for (i = 0; i < len; ++i) uval[i] = zero; - } /* if index != NULL */ - } /* for lb ... */ - - for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - - /* Scatter A into SPA (for L), or into U directly. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - if ( gb < jb ) { /* in U */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - while ( (k = index[Urb_indptr[lb]]) < jb ) { - /* Skip nonzero values in this block */ - Urb_length[lb] += index[Urb_indptr[lb]+1]; - /* Move pointer to the next block */ - Urb_indptr[lb] += UB_DESCRIPTOR - + SuperSize( k ); - } - /*assert(k == jb);*/ - /* start fstnz */ - istart = Urb_indptr[lb] + UB_DESCRIPTOR; - len = Urb_length[lb]; - fsupc1 = FstBlockC( gb+1 ); - k = j - fsupc; - /* Sum the lengths of the leading columns */ - for (jj = 0; jj < k; ++jj) - len += fsupc1 - index[istart++]; - /*assert(irow>=index[istart]);*/ - uval[len + irow - index[istart]] = a[i]; - } else { /* in L; put in SPA first */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... 
*/ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ #if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); #endif - /* Gather the values of A from SPA into Lnzval[]. */ - ljb = LBj( jb, grid ); /* Local block number */ - index = Lrowind_bc_ptr[ljb]; - if ( index ) { - nrbl = index[0]; /* Number of row blocks. */ - len = index[1]; /* LDA of lusup[]. */ - lusup = Lnzval_bc_ptr[ljb]; - next_lind = BC_HEADER; - next_lval = 0; - for (jj = 0; jj < nrbl; ++jj) { - gb = index[next_lind++]; - len1 = index[next_lind++]; /* Rows in the block. */ - lb = LBi( gb, grid ); - for (bnnz = 0; bnnz < len1; ++bnnz) { - irow = index[next_lind++]; /* Global index. */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - k = next_lval++; - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } - } /* for bnnz ... */ - } /* for jj ... */ - } /* if index ... */ + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ #if ( PROFlevel>=1 ) - t_l += SuperLU_timer_() - t; + t_l += SuperLU_timer_() - t; #endif - } /* if mycol == pc */ - } /* for jb ... */ + } /* if mycol == pc */ + } /* for jb ... */ - SUPERLU_FREE(dense); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); #if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 
2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", - t_l, t_u, u_blks, nrbu); + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); #endif - } else { - /* ------------------------------------------------------------ - FIRST TIME CREATING THE L AND U DATA STRUCTURES. - ------------------------------------------------------------*/ + } else { + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ #if ( PROFlevel>=1 ) - t_l = t_u = 0; u_blks = 0; + t_l = t_u = 0; u_blks = 0; #endif - /* We first need to set up the L and U data structures and then - * propagate the values of A into them. - */ - lsub = Glu_freeable->lsub; /* compressed L subscripts */ - xlsub = Glu_freeable->xlsub; - usub = Glu_freeable->usub; /* compressed U subscripts */ - xusub = Glu_freeable->xusub; - - if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) - ABORT("Malloc fails for ToRecv[]."); - for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; - - k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ - if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) - ABORT("Malloc fails for ToSendR[]."); - j = k * grid->npcol; - if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) - ABORT("Malloc fails for index[]."); + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); #if ( PRNTlevel>=1 ) - mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; #endif - for (i = 0; i < j; ++i) index1[i] = EMPTY; - for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; - k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - - /* Pointers to the beginning of each block row of U. */ - if ( !(Unzval_br_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Unzval_br_ptr[]."); - if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Ufstnz_br_ptr[]."); - - if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) - ABORT("Malloc fails for ToSendD[]."); - for (i = 0; i < k; ++i) ToSendD[i] = NO; - if ( !(ilsum = intMalloc_dist(k+1)) ) - ABORT("Malloc fails for ilsum[]."); - - /* Auxiliary arrays used to set up U block data structures. - They are freed on return. 
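
Aside: in the SamePattern_SameRowPerm branch above, new values of A flow through a sparse accumulator (SPA): each column is scattered into a dense work column indexed by local row position, then gathered into Lnzval[] in block order, zeroing the work column on the way out so it can be reused for the next column. A single-column sketch of that scatter/gather-and-reset pattern; the names and the flat block description are ours, while the real code walks Lrowind_bc_ptr and strides across all nsupc columns of the supernode.

    /* One column of A in compressed form: nnz entries with local row index
       lrow[k] and value val[k].  dense[] is the SPA work column, assumed
       zero on entry and restored to zero on exit. */
    void spa_scatter_gather(int nnz, const int *lrow, const double *val,
                            double *dense,
                            int nblk, const int *blk_first_row, const int *blk_len,
                            double *lnzval)
    {
        /* Scatter: drop the sparse entries into the dense work column. */
        for (int k = 0; k < nnz; ++k)
            dense[lrow[k]] = val[k];

        /* Gather: walk the already-known L row structure block by block,
           pulling values out and resetting the SPA entries to zero. */
        int out = 0;
        for (int b = 0; b < nblk; ++b)
            for (int r = 0; r < blk_len[b]; ++r) {
                int i = blk_first_row[b] + r;
                lnzval[out++] = dense[i];
                dense[i] = 0.0;
            }
    }
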
*/ - if ( !(rb_marker = intCalloc_dist(k)) ) - ABORT("Calloc fails for rb_marker[]."); - if ( !(Urb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_length[]."); - if ( !(Urb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Urb_indptr[]."); - if ( !(Urb_fstnz = intCalloc_dist(k)) ) - ABORT("Calloc fails for Urb_fstnz[]."); - if ( !(Ucbs = intCalloc_dist(k)) ) - ABORT("Calloc fails for Ucbs[]."); + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); #if ( PRNTlevel>=1 ) - mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; #endif - /* Compute ldaspa and ilsum[]. */ - ldaspa = 0; - ilsum[0] = 0; - for (gb = 0; gb < nsupers; ++gb) { - if ( myrow == PROW( gb, grid ) ) { - i = SuperSize( gb ); - ldaspa += i; - lb = LBi( gb, grid ); - ilsum[lb + 1] = ilsum[lb] + i; - } - } - + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - /* ------------------------------------------------------------ - COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). - ------------------------------------------------------------*/ - - /* Loop through each supernode column. */ - for (jb = 0; jb < nsupers; ++jb) { - pc = PCOL( jb, grid ); - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - /* Loop through each column in the block. */ - for (j = fsupc; j < fsupc + nsupc; ++j) { - /* usub[*] contains only "first nonzero" in each segment. */ - for (i = xusub[j]; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero of the segment. */ - gb = BlockNum( irow ); - kcol = PCOL( gb, grid ); - ljb = LBj( gb, grid ); - if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; - pr = PROW( gb, grid ); - lb = LBi( gb, grid ); - if ( mycol == pc ) { - if ( myrow == pr ) { - ToSendD[lb] = YES; - /* Count nonzeros in entire block row. */ - Urb_length[lb] += FstBlockC( gb+1 ) - irow; - if (rb_marker[lb] <= jb) {/* First see the block */ - rb_marker[lb] = jb + 1; - Urb_fstnz[lb] += nsupc; - ++Ucbs[lb]; /* Number of column blocks - in block row lb. 
*/ + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. */ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ #if ( PRNTlevel>=1 ) - ++nUblocks; + ++nUblocks; #endif - } - ToRecv[gb] = 1; - } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ - } - } /* for i ... */ - } /* for j ... */ - } /* for jb ... */ - - /* Set up the initial pointers for each block row in U. */ - nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - for (lb = 0; lb < nrbu; ++lb) { - len = Urb_length[lb]; - rb_marker[lb] = 0; /* Reset block marker. */ - if ( len ) { - /* Add room for descriptors */ - len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1+1)) ) - ABORT("Malloc fails for Uindex[]."); - Ufstnz_br_ptr[lb] = index; - if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) - ABORT("Malloc fails for Unzval_br_ptr[*][]."); - mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); - mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); - index[0] = Ucbs[lb]; /* Number of column blocks */ - index[1] = len; /* Total length of nzval[] */ - index[2] = len1; /* Total length of index[] */ - index[len1] = -1; /* End marker */ - } else { - Ufstnz_br_ptr[lb] = NULL; - Unzval_br_ptr[lb] = NULL; - } - Urb_length[lb] = 0; /* Reset block length. */ - Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ - Urb_fstnz[lb] = BR_HEADER; - } /* for lb ... */ - - SUPERLU_FREE(Ucbs); + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. 
*/ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); #if ( PROFlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); #endif #if ( PRNTlevel>=1 ) - mem_use -= 2.0*k * iword; + mem_use -= 2.0*k * iword; #endif - /* Auxiliary arrays used to set up L block data structures. - They are freed on return. - k is the number of local row blocks. */ - if ( !(Lrb_length = intCalloc_dist(k)) ) - ABORT("Calloc fails for Lrb_length[]."); - if ( !(Lrb_number = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_number[]."); - if ( !(Lrb_indptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_indptr[]."); - if ( !(Lrb_valptr = intMalloc_dist(k)) ) - ABORT("Malloc fails for Lrb_valptr[]."); - if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) - ABORT("Calloc fails for SPA dense[]."); - - /* These counts will be used for triangular solves. */ - if ( !(fmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for fmod[]."); - if ( !(bmod = intCalloc_dist(k)) ) - ABORT("Calloc fails for bmod[]."); - - /* ------------------------------------------------ */ + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ #if ( PRNTlevel>=1 ) - mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; #endif - k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ - - /* Pointers to the beginning of each block column of L. 
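
Aside: the block-row setup above fixes the layout of every nonempty U index[] array: a 3-word header (number of column blocks, total nzval length, total index length), then per column block a 2-word descriptor (global block number, block length) followed by one first-nonzero-row entry per column of that supernode, and -1 stored at offset index[2] as an end marker. A small walker that prints this layout; BR_HEADER = 3 and UB_DESCRIPTOR = 2 match the usage above, xsup[] holds supernode boundaries, and SuperSize(jb) is taken as xsup[jb+1] - xsup[jb].

    #include <stdio.h>

    #define BR_HEADER     3   /* header words per U block row (as used above)      */
    #define UB_DESCRIPTOR 2   /* descriptor words per column block (as used above) */

    /* Walk one U block-row index[] array: index[0] = number of column blocks,
       index[1] = total nzval length, index[2] = total index length; then, per
       column block, [global block id, block length] followed by one
       first-nonzero-row entry per column of that supernode;
       index[index[2]] == -1 terminates the row. */
    void print_u_blockrow(const long long *index, const long long *xsup)
    {
        long long nub = index[0], pos = BR_HEADER;
        printf("%lld column blocks, %lld nonzeros, index length %lld\n",
               index[0], index[1], index[2]);
        for (long long b = 0; b < nub; ++b) {
            long long jb  = index[pos];      /* global block (supernode) id */
            long long len = index[pos + 1];  /* nonzeros in this block      */
            printf("  block %lld: jb=%lld, %lld nonzeros, fstnz entries at index[%lld..]\n",
                   b, jb, len, pos + UB_DESCRIPTOR);
            pos += UB_DESCRIPTOR + (xsup[jb + 1] - xsup[jb]);
        }
    }
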
*/ - if ( !(Lnzval_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) - ABORT("Malloc fails for Lnzval_bc_ptr[]."); - if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_ptr[]."); - Lrowind_bc_ptr[k-1] = NULL; - if ( !(Lindval_loc_bc_ptr = - (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) - ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); - Lindval_loc_bc_ptr[k-1] = NULL; - - - - - if ( !(Linv_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); - } - if ( !(Uinv_bc_ptr = - (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); - } - Linv_bc_ptr[k-1] = NULL; - Uinv_bc_ptr[k-1] = NULL; - - /* These lists of processes will be used for triangular solves. */ - if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for fsendx_plist[]."); - len = k * grid->nprow; - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for fsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - fsendx_plist[i] = &index[j]; - if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) - ABORT("Malloc fails for bsendx_plist[]."); - if ( !(index = intMalloc_dist(len)) ) - ABORT("Malloc fails for bsendx_plist[0]"); - for (i = 0; i < len; ++i) index[i] = EMPTY; - for (i = 0, j = 0; i < k; ++i, j += grid->nprow) - bsendx_plist[i] = &index[j]; - /* -------------------------------------------------------------- */ + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + /* These lists of processes will be used for triangular solves. 
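
Aside: the fsendx_plist / bsendx_plist lists set up here use one backing allocation of k*nprow entries that is carved into k slices of length nprow, all initialized to EMPTY. A minimal sketch of that single-slab carving, with plain malloc standing in for intMalloc_dist and EMPTY assumed to be -1.

    #include <stdlib.h>

    #define EMPTY (-1)   /* sentinel used by the solve lists (assumed -1) */

    /* Allocate k lists of length nprow as one contiguous slab and carve it.
       Release with free(plist[0]); free(plist); */
    int **alloc_plist(int k, int nprow)
    {
        int **plist = (int **) malloc((size_t) k * sizeof(int *));
        int  *slab  = (int *)  malloc((size_t) k * nprow * sizeof(int));
        if (!plist || !slab) { free(plist); free(slab); return NULL; }
        for (int i = 0; i < k * nprow; ++i) slab[i] = EMPTY;
        for (int i = 0; i < k; ++i) plist[i] = &slab[i * nprow];
        return plist;
    }
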
*/ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ #if ( PRNTlevel>=1 ) - mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; #endif - /*------------------------------------------------------------ - PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. - THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. - ------------------------------------------------------------*/ - - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - pc = PCOL( jb, grid ); - if ( mycol == pc ) { /* Block column jb in my process column */ - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - - /* Scatter A into SPA. */ - for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { - for (i = xa[j]; i < xa[j+1]; ++i) { - irow = asub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - irow = ilsum[lb] + irow - FstBlockC( gb ); - dense_col[irow] = a[i]; - } - } - dense_col += ldaspa; - } /* for j ... */ - - jbrow = PROW( jb, grid ); + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ - /*------------------------------------------------ - * SET UP U BLOCKS. - *------------------------------------------------*/ + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... */ + + jbrow = PROW( jb, grid ); + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - kseen = 0; - dense_col = dense; - /* Loop through each column in the block column. */ - for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j+1]; ++i) { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. 
owning jb */ - bsendx_plist[ljb][pr] == EMPTY ) { - bsendx_plist[ljb][pr] = YES; - ++nbsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - index = Ufstnz_br_ptr[lb]; - uval = Unzval_br_ptr[lb]; - fsupc1 = FstBlockC( gb+1 ); - if (rb_marker[lb] <= jb) { /* First time see - the block */ - rb_marker[lb] = jb + 1; - Urb_indptr[lb] = Urb_fstnz[lb];; - index[Urb_indptr[lb]] = jb; /* Descriptor */ - Urb_indptr[lb] += UB_DESCRIPTOR; - /* Record the first location in index[] of the - next block */ - Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; - len = Urb_indptr[lb];/* Start fstnz in index */ - index[len-1] = 0; - for (k = 0; k < nsupc; ++k) - index[len+k] = fsupc1; - if ( gb != jb )/* Exclude diagonal block. */ - ++bmod[lb];/* Mod. count for back solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nbrecvx; - kseen = 1; - } - } else { /* Already saw the block */ - len = Urb_indptr[lb];/* Start fstnz in index */ - } - jj = j - fsupc; - index[len+jj] = irow; - /* Load the numerical values */ - k = fsupc1 - irow; /* No. of nonzeros in segment */ - index[len-1] += k; /* Increment block length in - Descriptor */ - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (ii = 0; ii < k; ++ii) { - uval[Urb_length[lb]++] = dense_col[irow + ii]; - dense_col[irow + ii] = zero; - } - } /* if myrow == pr ... */ - } /* for i ... */ - dense_col += ldaspa; - } /* for j ... */ + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + // if(ljb==0){ + // printf("no here??\n"); + // fflush(stdout); + // } + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ #if ( PROFlevel>=1 ) - t_u += SuperLU_timer_() - t; - t = SuperLU_timer_(); + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); #endif - /*------------------------------------------------ - * SET UP L BLOCKS. 
- *------------------------------------------------*/ - - /* Count number of blocks and length of each block. */ - nrbl = 0; - len = 0; /* Number of row subscripts I own. */ - kseen = 0; - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); /* Global block number */ - pr = PROW( gb, grid ); /* Process row owning this block */ - if ( pr != jbrow && - myrow == jbrow && /* diag. proc. owning jb */ - fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { - fsendx_plist[ljb][pr] = YES; - ++nfsendx; - } - if ( myrow == pr ) { - lb = LBi( gb, grid ); /* Local block number */ - if (rb_marker[lb] <= jb) { /* First see this block */ - rb_marker[lb] = jb + 1; - Lrb_length[lb] = 1; - Lrb_number[nrbl++] = gb; - // if(gb==747)printf("worita %5d%5d",iam,jb); - if ( gb != jb ) /* Exclude diagonal block. */ - ++fmod[lb]; /* Mod. count for forward solve */ - if ( kseen == 0 && myrow != jbrow ) { - ++nfrecvx; - kseen = 1; - } + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + // if(gb==747)printf("worita %5d%5d",iam,jb); + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } #if ( PRNTlevel>=1 ) - ++nLblocks; + ++nLblocks; #endif - } else { - ++Lrb_length[lb]; - } - ++len; - } - } /* for i ... */ - - if ( nrbl ) { /* Do not ensure the blocks are sorted! */ - /* Set up the initial pointers for each block in - index[] and nzval[]. */ - /* Add room for descriptors */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - if ( !(index = intMalloc_dist(len1)) ) - ABORT("Malloc fails for index[]"); - if (!(lusup = - doubleMalloc_dist(len*nsupc))) { - fprintf(stderr, "col block " IFMT " ", jb); - ABORT("Malloc fails for lusup[]"); - } - if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) - ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); - - - if (!(Linv_bc_ptr[ljb] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Linv_bc_ptr[*][] col block " IFMT, jb); - } - if (!(Uinv_bc_ptr[ljb] = - doubleCalloc_dist(nsupc*nsupc))) { - fprintf(stderr, "Malloc fails for Uinv_bc_ptr[*][] col block " IFMT, jb); - } + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. 
*/ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = + doubleMalloc_dist(len*nsupc))) { + fprintf(stderr, "col block " IFMT " ", jb); + ABORT("Malloc fails for lusup[]"); + } + // if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + + + + + if (!(Linv_bc_ptr[ljb] = + doubleCalloc_dist(nsupc*nsupc))) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[*][] col block " IFMT, jb); + } + if (!(Uinv_bc_ptr[ljb] = + doubleCalloc_dist(nsupc*nsupc))) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[*][] col block " IFMT, jb); + } + + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + + + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + + // if(ljb==0){ + // printf("lb %5d, ind %5d, val %5d\n",lb,next_lind,next_lval); + // fflush(stdout); + // } + + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + + + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... 
*/ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = doubleMalloc_dist(len*nsupc))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ - mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); - mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); - mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); - index[0] = nrbl; /* Number of row blocks */ - index[1] = len; /* LDA of the nzval[] */ - next_lind = BC_HEADER; - next_lval = 0; - for (k = 0; k < nrbl; ++k) { - gb = Lrb_number[k]; - lb = LBi( gb, grid ); - len = Lrb_length[lb]; - - - Lindval_loc_bc_ptr[ljb][k] = lb; - Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; - Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; - - // if(ljb==0){ - // printf("lb %5d, ind %5d, val %5d\n",lb,next_lind,next_lval); + } /* for jb ... */ + + // for (j=0;j<19*3;j++){ + // printf("Lindval %5d\n",Lindval_loc_bc_ptr[0][j]); // fflush(stdout); // } + - Lrb_length[lb] = 0; /* Reset vector of block length */ - index[next_lind++] = gb; /* Descriptor */ - index[next_lind++] = len; - Lrb_indptr[lb] = next_lind; - Lrb_valptr[lb] = next_lval; - next_lind += len; - next_lval += len; - } - + ///////////////////////////////////////////////////////////////// - /* Propagate the compressed row subscripts to Lindex[], - and the initial values of A from SPA into Lnzval[]. */ - len = index[1]; /* LDA of lusup[] */ - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - if ( myrow == PROW( gb, grid ) ) { - lb = LBi( gb, grid ); - k = Lrb_indptr[lb]++; /* Random access a block */ - index[k] = irow; - k = Lrb_valptr[lb]++; - irow = ilsum[lb] + irow - FstBlockC( gb ); - for (j = 0, dense_col = dense; j < nsupc; ++j) { - lusup[k] = dense_col[irow]; - dense_col[irow] = zero; - k += len; - dense_col += ldaspa; - } + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } } - } /* for i ... 
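Lindval_loc_bc_ptr[ljb] above packs three parallel rows of length nrbl (local block number, offset into index[], offset into lusup[]), and quickSortM(lloc, 0, uu, nrbl, 0, 3) reorders the key row while carrying the other two rows along; the sorted order is then mirrored into index_srt[] and lusup_srt[]. The helper below is a hypothetical standalone illustration of that "sort one row, co-permute the companions" idea using a plain insertion sort; it is not the library's quickSortM.

/* Hypothetical sketch: sort columns of a dim x m table (row-major, row stride m)
 * by row 0, carrying the other rows along -- the effect the hunk above relies on
 * from quickSortM(lloc, lo, hi, m, 0, dim).  Not the real routine. */
#include <stdio.h>

static void sort_by_first_row(long *tab, int lo, int hi, int m, int dim) {
    for (int i = lo + 1; i <= hi; ++i) {
        for (int j = i; j > lo && tab[j-1] > tab[j]; --j) {
            for (int r = 0; r < dim; ++r) {          /* swap whole column j-1 <-> j */
                long t = tab[r*m + j-1];
                tab[r*m + j-1] = tab[r*m + j];
                tab[r*m + j] = t;
            }
        }
    }
}

int main(void) {
    enum { NRBL = 4 };
    /* rows: local block number, index offset, value offset */
    long lloc[3*NRBL] = { 9, 2, 5, 7,   10, 20, 30, 40,   0, 8, 16, 24 };
    sort_by_first_row(lloc, 0, NRBL-1, NRBL, 3);
    for (int r = 0; r < 3; ++r, puts(""))
        for (int c = 0; c < NRBL; ++c) printf("%4ld", lloc[r*NRBL + c]);
    return 0;
}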
*/ - - Lrowind_bc_ptr[ljb] = index; - Lnzval_bc_ptr[ljb] = lusup; - - /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ - if(nrbl>1){ - krow = PROW( jb, grid ); - if(myrow==krow){ /* skip the diagonal block */ - uu=nrbl-2; - lloc = &Lindval_loc_bc_ptr[ljb][1]; - }else{ - uu=nrbl-1; - lloc = Lindval_loc_bc_ptr[ljb]; - } - quickSortM(lloc,0,uu,nrbl,0,3); + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + ///////////////////////////////////////////////////////////////// + + if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
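The Urbs counting pass above walks the skeleton of each nonempty block row of U: usub1[0] gives the number of column blocks, the first descriptor starts after BR_HEADER entries, and each block advances the cursor by UB_DESCRIPTOR plus SuperSize(k) first-nonzero entries. The sketch below reproduces that walk in isolation; the header/descriptor lengths and the fixed supernode size are stand-ins chosen for illustration, the real values come from superlu_defs.h and Glu_persist->xsup.

/* Sketch of the U index-array walk used above to count row blocks per block
 * column.  BR_HEADER, UB_DESCRIPTOR and SuperSize_stub are illustrative. */
#include <stdio.h>

#define BR_HEADER     3   /* assumed header length of a block-row index array */
#define UB_DESCRIPTOR 2   /* assumed per-block descriptor length              */

static int SuperSize_stub(int gb) { (void)gb; return 4; }  /* fixed-size supernodes */

int main(void) {
    /* index array of one block row: [header | {gb, len, fstnz...} per block] */
    int usub1[] = { 2, 0, 0,            /* header: 2 column blocks, ...       */
                    5, 4, 0,0,0,0,      /* block 5: descriptor + 4 fstnz      */
                    9, 4, 0,0,0,0 };    /* block 9: descriptor + 4 fstnz      */
    int i = BR_HEADER;
    for (int lb = 0; lb < usub1[0]; ++lb) {
        int k = usub1[i];               /* global block (column) number       */
        printf("row block touches global block column %d\n", k);
        i += UB_DESCRIPTOR + SuperSize_stub(k);
    }
    return 0;
}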
*/ - Lindval_loc_bc_ptr[ljb][i+nrbl] = idx_indx - LB_DESCRIPTOR - nbrow; + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } - for (jj=0;jjnprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb]); + BcTree_SetTag(LBtree_ptr[ljb],BC_L); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb])); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) - t_l += SuperLU_timer_() - t; -#endif - } /* if mycol == pc */ + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + - } /* for jb ... */ +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
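Both tree setups above follow the same recipe: record in ActiveFlag, per process row (or column), the extreme global block it touches, sort with quickSortM, put the root first in ranks[], append each remaining active process once, and only create a tree when more than one rank participates. The sketch below isolates that rank-list assembly; the 3*nsupers "inactive" sentinel is taken from the diff, the sorting step is omitted, and the sizes and data are invented.

/* Sketch of the rank-list assembly done before BcTree_Create()/RdTree_Create():
 * ActiveFlag[0..P-1] holds the smallest global block seen by each process row
 * (sentinel 3*nsupers means "not involved"), ActiveFlag[P..2P-1] the row ids. */
#include <assert.h>
#include <stdio.h>

int main(void) {
    enum { P = 4 };                 /* process rows in one grid column        */
    int nsupers = 10, jb = 6;       /* #supernodes, block column being set up */
    int ActiveFlag[2*P] = { 6, 3*10, 2, 8,   0, 1, 2, 3 };  /* row 0 owns jb   */
    int ranks[P], Root = -1, rank_cnt = 0;

    for (int j = 0; j < P; ++j)
        if (ActiveFlag[j] != 3*nsupers && ActiveFlag[j] == jb) Root = j;  /* diag row */
    assert(Root > -1);

    ranks[rank_cnt++] = Root;                        /* root goes first        */
    for (int j = 0; j < P; ++j)
        if (ActiveFlag[j] != 3*nsupers && ActiveFlag[j+P] != Root)
            ranks[rank_cnt++] = ActiveFlag[j+P];     /* every other active row */

    printf("tree over %d ranks, root %d:", rank_cnt, Root);
    for (int j = 0; j < rank_cnt; ++j) printf(" %d", ranks[j]);
    puts("");
    return 0;
}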
*/ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - // for (j=0;j<19*3;j++){ - // printf("Lindval %5d\n",Lindval_loc_bc_ptr[0][j]); - // fflush(stdout); - // } - - ///////////////////////////////////////////////////////////////// - - if(LSUMnpcol );/* Number of local block columns */ - if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - ABORT("Malloc fails for LBtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = intCalloc_dist(grid->nprow)) ) - ABORT("Calloc fails for ranks[]."); - if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) - ABORT("Malloc fails for SeedSTD_BC[]."); - - for (i=0;icscp.comm); - - for (ljb = 0; ljb nprow;++j)ActiveFlag[j]=3*nsupers; - for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; - for (j=0;jnprow;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - fsupc = FstBlockC( jb ); - nsupc = SuperSize( jb ); - ljb = LBj( jb, grid ); /* Local block number */ - LBtree_ptr[ljb]=NULL; - - - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc+1]; ++i) { - irow = lsub[i]; - gb = BlockNum( irow ); - pr = PROW( gb, grid ); - ActiveFlag[pr]=MIN(ActiveFlag[pr],gb); - if(gb==jb)Root=pr; - if(myrow==pr)Iactive=1; - - } /* for j ... */ - - quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); - - if(Iactive==1){ - - - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->nprow; ++j){ - if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; - ++rank_cnt; + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;i1){ - - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb]); - BcTree_SetTag(LBtree_ptr[ljb],jb); - - // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); - - // if(iam==15 || iam==3){ - // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb])); - // fflush(stdout); - // } - // TreeTest(LBtree_ptr[ljb]); - -// #if ( PRNTlevel>=1 ) - if(Root==myrow){ - rank_cnt_ref=1; - for (j = 0; j < grid->nprow; ++j) { - if ( fsendx_plist[ljb][j] != EMPTY ) { - ++rank_cnt_ref; + + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
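SeedSTD_BC and SeedSTD_RD are reduced element-wise with MPI_MAX over the column or row sub-communicator before any tree is created, which is a simple way for every participant of a tree to end up with the same seed even though each rank initializes only its own entries. The initialization loop itself is garbled in this hunk, so the rand() fill in the sketch below is an assumption; only the Allreduce pattern is taken from the diff.

/* Sketch: per-tree seeds agreed upon by an elementwise MAX reduction, mirroring
 * the SeedSTD_BC / SeedSTD_RD handling above.  Build with mpicc, run with mpirun. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, k = 4;                       /* k = number of local trees */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double *seed = malloc(k * sizeof(double));
    srand(rank + 1);
    for (int i = 0; i < k; ++i) seed[i] = (double)rand();   /* assumed init */

    /* After this, seed[i] is identical on every rank of the communicator. */
    MPI_Allreduce(MPI_IN_PLACE, seed, k, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    if (rank == 0) printf("agreed seed[0] = %.0f\n", seed[0]);
    free(seed);
    MPI_Finalize();
    return 0;
}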
*/ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } } } - assert(rank_cnt==rank_cnt_ref); - - // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + - // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); - // // for(j=0;jnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib]); + RdTree_SetTag(LRtree_ptr[lib], RD_L); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib])); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;jnprow );/* Number of local block rows */ - if ( !(mod_bit = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for mod_bit[]."); - if ( !(frecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - pr = PROW( k, grid ); - if ( myrow == pr ) { - lib = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if (mycol == kcol || fmod[lib] ) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - - - - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - ABORT("Malloc fails for LRtree_ptr[]."); - if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) - ABORT("Calloc fails for ActiveFlag[]."); - if ( !(ranks = intCalloc_dist(grid->npcol)) ) - ABORT("Calloc fails for ranks[]."); + SUPERLU_FREE(mod_bit); + SUPERLU_FREE(frecv); - if ( !(idxs = intCalloc_dist(nsupers)) ) - ABORT("Calloc fails for idxs[]."); - if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - ABORT("Malloc fails for nzrows[]."); + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + // SUPERLU_FREE(idxs); + SUPERLU_FREE(SeedSTD_RD); + // for(i=0;irscp.comm); - - - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - fsupc = FstBlockC( jb ); - len=xlsub[fsupc+1]-xlsub[fsupc]; - idxs[jb] = len-1; - if(len>0){ - if ( !(nzrows[jb] = intMalloc_dist(len)) ) - ABORT("Malloc fails for nzrows[jb]"); - for(i=xlsub[fsupc];i=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;i0 ; --ib) { - pr = PROW( ib, grid ); - if ( myrow == pr ) { /* Block row ib in my process row */ - - for (j=0;jnpcol;++j)ActiveFlag[j]=-3*nsupers; - for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; - for (j=0;jnpcol;++j)ranks[j]=-1; - Root=-1; - Iactive = 0; - lib = LBi( ib, grid ); /* Local block number */ - LRtree_ptr[lib]=NULL; + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_BC[0],k,MPI_DOUBLE,MPI_MAX,grid->cscp.comm); - for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - fsupc = FstBlockC( jb ); - if(idxs[jb]>=0){ /* if column jb has not been iterated through */ - irow = nzrows[jb][idxs[jb]]; - gb = BlockNum( irow ); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; - while(gb>=ib){ - if(gb==ib){ /* (ib,jb) nonempty*/ - pc = PCOL( jb, grid ); - ActiveFlag[pc]=MAX(ActiveFlag[pc],jb); - if(ib==jb)Root=pc; - if(mycol==pc)Iactive=1; - } - if(idxs[jb]-1>=0){ - --idxs[jb]; - irow = nzrows[jb][idxs[jb]]; - gb = BlockNum( irow );} - else{break;} + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb]); + BcTree_SetTag(UBtree_ptr[ljb],BC_U); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;inpcol-1,grid->npcol,1,2); - - if(Iactive==1){ - assert( Root>-1 ); - rank_cnt = 1; - ranks[0]=Root; - for (j = 0; j < grid->npcol; ++j){ - if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ - ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; - ++rank_cnt; + + MPI_Allreduce(MPI_IN_PLACE,&SeedSTD_RD[0],k,MPI_DOUBLE,MPI_MAX,grid->rscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib 1){ + - for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib]); - RdTree_SetTag(LRtree_ptr[lib], ib+nsupers); - // } - - // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); - // fflush(stdout); - - // if(ib==15 || ib ==16){ - - // if(iam==15 || iam==3){ - // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib])); - // fflush(stdout); - // } - - -// #if ( PRNTlevel>=1 ) - // if(Root==mycol){ - // assert(rank_cnt==frecv[lib]); - // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // // for(j=0;jnpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib]); + RdTree_SetTag(URtree_ptr[lib], RD_U); + // } - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - Llu->LRtree_ptr = LRtree_ptr; - Llu->LBtree_ptr = LBtree_ptr; - Llu->Linv_bc_ptr = Linv_bc_ptr; - Llu->Uinv_bc_ptr = Uinv_bc_ptr; + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + #if ( PRNTlevel>=1 ) - if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", - nLblocks, nUblocks); + if ( !iam ) printf(".. 
# L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); - k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - if ( !(Llu->mod_bit = intMalloc_dist(k)) ) - ABORT("Malloc fails for mod_bit[]."); + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); #if ( PROFlevel>=1 ) - if ( !iam ) printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif - } /* else fact != SamePattern_SameRowPerm */ + } /* else fact != SamePattern_SameRowPerm */ - if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ - SUPERLU_FREE(asub); - SUPERLU_FREE(a); - } - SUPERLU_FREE(xa); + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ + SUPERLU_FREE(asub); + SUPERLU_FREE(a); + } + SUPERLU_FREE(xa); #if ( DEBUGlevel>=1 ) - /* Memory allocated but not freed: - ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pddistribute()"); + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit pddistribute()"); #endif - - return (mem_use); -} /* PDDISTRIBUTE */ + + return (mem_use); + } /* PDDISTRIBUTE */ diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index ad60a353..c5287d72 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -923,6 +923,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, #endif } + + /* ------------------------------------------------------------ Perform the LU factorization: symbolic factorization, redistribution, and numerical factorization. @@ -974,7 +976,15 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, if ( permc_spec != MY_PERMC && Fact == DOFACT ) { /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */ if ( permc_spec == PARMETIS ) { - /* Get column permutation vector in perm_c. * + + + // #pragma omp parallel + // { + // #pragma omp master + // { + + + /* Get column permutation vector in perm_c. * * This routine takes as input the distributed input matrix A * * and does not modify it. 
It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * @@ -982,6 +992,9 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, noDomains, &sizes, &fstVtxSep, grid, &symb_comm); + // } + // } + if (flinfo > 0) { #if ( PRNTlevel>=1 ) fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n"); @@ -1132,8 +1145,20 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); + + + + + // #pragma omp parallel + // { + // #pragma omp master + // { + pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; + stat->utime[FACT] = SuperLU_timer_() - t; + // } + // } + #if 0 @@ -1304,25 +1329,30 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, For repeated call to pdgssvx(), no need to re-initialilze the Solve data & communication structures, unless a new factorization with Fact == DOFACT or SamePattern is asked for. */ + if(options->DiagInv==YES){ + + #ifdef _CRAY + blas_flag=1; + #elif defined (USE_VENDOR_BLAS) + blas_flag=2; + #else + blas_flag=0; + #endif + if(blas_flag==0) + ABORT("DiagInv doesn't works with internal blas\n"); + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + } } - - if(options->DiagInv==YES){ - -#ifdef _CRAY - blas_flag=1; -#elif defined (USE_VENDOR_BLAS) - blas_flag=2; -#else - blas_flag=0; -#endif - if(blas_flag==0) - ABORT("DiagInv doesn't works with internal blas\n"); - pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); - } - + // #pragma omp parallel + // { + // #pragma omp master + // { pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + // } + // } + /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c index abd6e0a1..f00c77f5 100644 --- a/SRC/pdgstrs.c +++ b/SRC/pdgstrs.c @@ -1,13 +1,13 @@ /*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) + Copyright (c) 2003, The Regents of the University of California, through + Lawrence Berkeley National Laboratory (subject to receipt of any required + approvals from U.S. Dept. of Energy) -All rights reserved. + All rights reserved. -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ + The source code is distributed under BSD license, see the file License.txt + at the top-level directory. + */ /*! @file @@ -23,6 +23,10 @@ at the top-level directory. #include #include "superlu_ddefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + /* * Sketch of the algorithm for L-solve: * ======================= @@ -79,7 +83,7 @@ at the top-level directory. * | | | | | | * --------- <---------------| */ - + /*#define ISEND_IRECV*/ /* @@ -87,7 +91,7 @@ at the top-level directory. 
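The new CACHELINE macro in pdgstrs.c, together with the ((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i expression in the pddistribute hunk earlier, rounds small metadata allocations up to whole cache lines. A small sketch of that round-up follows; defining aln_i as CACHELINE / sizeof(int_t) is an assumption, since this diff only shows the rounding expression itself.

/* Sketch: round an element count up to a whole number of cache lines, as done
 * for Lindval_loc_bc_ptr above.  int_t and aln_i are stand-ins. */
#include <stdint.h>
#include <stdio.h>

#ifndef CACHELINE
#define CACHELINE 64            /* bytes */
#endif
typedef int64_t int_t;          /* stand-in for SuperLU_DIST's int_t */

int main(void) {
    int_t aln_i = CACHELINE / sizeof(int_t);        /* elements per line      */
    int_t nrbl  = 5;                                /* row blocks in a column */
    int_t need  = nrbl * 3;                         /* 3 rows of metadata     */
    int_t padded = ((need + (aln_i - 1)) / aln_i) * aln_i;
    printf("need %lld, allocate %lld (multiple of %lld)\n",
           (long long)need, (long long)padded, (long long)aln_i);
    return 0;
}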
*/ #ifdef _CRAY fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); + double*, int*, double*, int*); _fcd ftcs1; _fcd ftcs2; _fcd ftcs3; @@ -146,200 +150,200 @@ _fcd ftcs3; * */ -int_t + int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, - int_t fst_row, int_t *ilsum, double *x, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, - gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) + int_t fst_row, int_t *ilsum, double *x, + ScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, + gridinfo_t *grid, SOLVEstruct_t *SOLVEstruct) { - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *perm_r, *perm_c; /* row and column permutation vectors */ - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *xsup, *supno; - int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk; - int p, procs; - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *perm_r, *perm_c; /* row and column permutation vectors */ + int_t *send_ibuf, *recv_ibuf; + double *send_dbuf, *recv_dbuf; + int_t *xsup, *supno; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; + int p, procs; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; MPI_Request req_i, req_d, *req_send, *req_recv; MPI_Status status, *status_send, *status_recv; int Nreq_recv, Nreq_send, pp; - + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()"); + CHECK_MALLOC(grid->iam, "Enter pdReDistribute_B_to_X()"); #endif - /* ------------------------------------------------------------ - INITIALIZATION. - ------------------------------------------------------------*/ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - procs = grid->nprow * grid->npcol; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - SendCnt = gstrs_comm->B_to_X_SendCnt; - SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; - RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; - sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; - rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - /* ------------------------------------------------------------ - NOW COMMUNICATE THE ACTUAL DATA. 
- ------------------------------------------------------------*/ - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - recv_dbuf = send_dbuf + k * nrhs; - if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) - ABORT("Malloc fails for req_send[]."); - if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) - ABORT("Malloc fails for req_recv[]."); - if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) - ABORT("Malloc fails for status_send[]."); - if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) - ABORT("Malloc fails for status_recv[]."); - - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls[p] * nrhs; - } - - /* Copy the row indices and values to the send buffer. */ - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - k = ptr_to_ibuf[p]; - send_ibuf[k] = irow; - k = ptr_to_dbuf[p]; - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ - send_dbuf[k++] = B[i + j*ldb]; + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + SendCnt = gstrs_comm->B_to_X_SendCnt; + SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; + RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; + sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; + rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + /* ------------------------------------------------------------ + NOW COMMUNICATE THE ACTUAL DATA. 
+ ------------------------------------------------------------*/ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = doubleMalloc_dist((k + l)* (size_t)nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls[p] * nrhs; } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; - } - + /* Copy the row indices and values to the send buffer. */ + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + k = ptr_to_ibuf[p]; + send_ibuf[k] = irow; + k = ptr_to_dbuf[p]; + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + send_dbuf[k++] = B[i + j*ldb]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; + } + + #if 1 - /* Communicate the (permuted) row indices. */ - MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - - /* Communicate the numerical values. */ - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); - + /* Communicate the (permuted) row indices. */ + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + + /* Communicate the numerical values. */ + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); + #else - - /* Communicate the (permuted) row indices. */ - MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); - - /* Communicate the numerical values. */ - MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm, &req_d); + + /* Communicate the (permuted) row indices. */ + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); + + /* Communicate the numerical values. 
*/ + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm, &req_d); MPI_Wait(&req_i,&status); MPI_Wait(&req_d,&status); - + #endif -// MPI_Barrier( grid->comm ); + // MPI_Barrier( grid->comm ); -// Nreq_send=0; -// for (pp=0;pp0){ - // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; + // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, + // &req_send[Nreq_send] ); + // Nreq_send++; + // } // } -// } -// Nreq_recv=0; -// for (pp=0;pp0){ - // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; + // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, + // &req_recv[Nreq_recv] ); + // Nreq_recv++; + // } // } -// } -// if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); -// if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); -// Nreq_send=0; -// for (pp=0;pp0){ - // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; + // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, + // &req_send[Nreq_send] ); + // Nreq_send++; + // } // } -// } -// Nreq_recv=0; -// for (pp=0;pp0){ - // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; + // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, + // &req_recv[Nreq_recv] ); + // Nreq_recv++; // } -// } - -// if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); -// if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); - - - - /* ------------------------------------------------------------ - Copy buffer into X on the diagonal processes. - ------------------------------------------------------------*/ - ii = 0; - for (p = 0; p < procs; ++p) { - jj = rdispls_nrhs[p]; - for (i = 0; i < RecvCnt[p]; ++i) { - /* Only the diagonal processes do this; the off-diagonal processes - have 0 RecvCnt. */ - irow = recv_ibuf[ii]; /* The permuted row index. */ - k = BlockNum( irow ); - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number. */ - l = X_BLK( lk ); - x[l - XK_H] = k; /* Block number prepended in the header. */ - irow = irow - FstBlockC(k); /* Relative row number in X-block */ - RHS_ITERATE(j) { - x[l + irow + j*knsupc] = recv_dbuf[jj++]; - } - ++ii; + // } + + // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + + /* ------------------------------------------------------------ + Copy buffer into X on the diagonal processes. + ------------------------------------------------------------*/ + ii = 0; + for (p = 0; p < procs; ++p) { + jj = rdispls_nrhs[p]; + for (i = 0; i < RecvCnt[p]; ++i) { + /* Only the diagonal processes do this; the off-diagonal processes + have 0 RecvCnt. */ + irow = recv_ibuf[ii]; /* The permuted row index. */ + k = BlockNum( irow ); + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number. */ + l = X_BLK( lk ); + x[l - XK_H] = k; /* Block number prepended in the header. 
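pdReDistribute_B_to_X packs row indices and right-hand-side values per destination rank and exchanges them with one MPI_Alltoallv for the indices and one for the values; the #else branch keeps a non-blocking MPI_Ialltoallv variant, and the commented-out code an Isend/Irecv alternative. The sketch below reproduces that exchange pattern in isolation with one value per row; the ownership rule and the counts are invented here, whereas the real code takes SendCnt/sdispls from gstrs_comm.

/* Sketch of the MPI_Alltoallv exchange used to move values to the rank that
 * owns each row.  Ownership rule (row % nprocs) is illustrative only. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    int nloc = 4;                                    /* rows on this rank     */
    int *sendcnt = calloc(procs, sizeof(int));
    int *recvcnt = calloc(procs, sizeof(int));
    int *sdis = calloc(procs, sizeof(int));
    int *rdis = calloc(procs, sizeof(int));
    double *sendbuf = malloc(nloc * sizeof(double));

    /* Pack: destination of global row i is i % procs (illustrative rule). */
    for (int i = 0; i < nloc; ++i) sendcnt[(rank*nloc + i) % procs]++;
    for (int p = 1; p < procs; ++p) sdis[p] = sdis[p-1] + sendcnt[p-1];
    { int *fill = calloc(procs, sizeof(int));
      for (int i = 0; i < nloc; ++i) {
          int p = (rank*nloc + i) % procs;
          sendbuf[sdis[p] + fill[p]++] = (double)(rank*nloc + i);
      }
      free(fill); }

    /* Everyone learns how much it will receive, then the values move. */
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, MPI_COMM_WORLD);
    int total = 0;
    for (int p = 0; p < procs; ++p) { rdis[p] = total; total += recvcnt[p]; }
    double *recvbuf = malloc((total ? total : 1) * sizeof(double));
    MPI_Alltoallv(sendbuf, sendcnt, sdis, MPI_DOUBLE,
                  recvbuf, recvcnt, rdis, MPI_DOUBLE, MPI_COMM_WORLD);

    printf("rank %d received %d values\n", rank, total);
    free(sendcnt); free(recvcnt); free(sdis); free(rdis);
    free(sendbuf); free(recvbuf);
    MPI_Finalize();
    return 0;
}

Build with mpicc and run under mpirun; each rank reports how many rows it now owns.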
*/ + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = recv_dbuf[jj++]; + } + ++ii; + } } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); - SUPERLU_FREE(req_send); - SUPERLU_FREE(req_recv); - SUPERLU_FREE(status_send); - SUPERLU_FREE(status_recv); - + + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()"); + CHECK_MALLOC(grid->iam, "Exit pdReDistribute_B_to_X()"); #endif - return 0; + return 0; } /* pdReDistribute_B_to_X */ /*! \brief @@ -357,315 +361,315 @@ pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, * */ -int_t + int_t pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row, - int_t nrhs, double *x, int_t *ilsum, - ScalePermstruct_t *ScalePermstruct, - Glu_persist_t *Glu_persist, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + int_t nrhs, double *x, int_t *ilsum, + ScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + SOLVEstruct_t *SOLVEstruct) { - int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; - int_t *xsup, *supno; - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; - int *ptr_to_ibuf, *ptr_to_dbuf; - int_t *send_ibuf, *recv_ibuf; - double *send_dbuf, *recv_dbuf; - int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; - int iam, p, q, pkk, procs; - int_t num_diag_procs, *diag_procs; + int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; + int_t *xsup, *supno; + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + double *send_dbuf, *recv_dbuf; + int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + int iam, p, q, pkk, procs; + int_t num_diag_procs, *diag_procs; MPI_Request req_i, req_d, *req_send, *req_recv; MPI_Status status, *status_send, *status_recv; int Nreq_recv, Nreq_send, pp; - + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()"); + CHECK_MALLOC(grid->iam, "Enter pdReDistribute_X_to_B()"); #endif - /* ------------------------------------------------------------ - INITIALIZATION. 
- ------------------------------------------------------------*/ - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; - iam = grid->iam; - procs = grid->nprow * grid->npcol; - - SendCnt = gstrs_comm->X_to_B_SendCnt; - SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; - RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; - RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; - sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; - sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; - rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; - rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; - ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; - ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; - - k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ - l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ - if ( !(send_ibuf = intMalloc_dist(k + l)) ) - ABORT("Malloc fails for send_ibuf[]."); - recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) - ABORT("Malloc fails for send_dbuf[]."); - if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) - ABORT("Malloc fails for req_send[]."); - if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) - ABORT("Malloc fails for req_recv[]."); - if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) - ABORT("Malloc fails for status_send[]."); - if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) - ABORT("Malloc fails for status_recv[]."); - - - recv_dbuf = send_dbuf + k * nrhs; - for (p = 0; p < procs; ++p) { - ptr_to_ibuf[p] = sdispls[p]; - ptr_to_dbuf[p] = sdispls_nrhs[p]; - } - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number */ - irow = FstBlockC( k ); - l = X_BLK( lk ); - for (i = 0; i < knsupc; ++i) { + /* ------------------------------------------------------------ + INITIALIZATION. 
+ ------------------------------------------------------------*/ + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + iam = grid->iam; + procs = grid->nprow * grid->npcol; + + SendCnt = gstrs_comm->X_to_B_SendCnt; + SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; + RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; + sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; + rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + + + recv_dbuf = send_dbuf + k * nrhs; + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls_nrhs[p]; + } + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number */ + irow = FstBlockC( k ); + l = X_BLK( lk ); + for (i = 0; i < knsupc; ++i) { #if 0 - ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ + ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */ #else - ii = irow; + ii = irow; #endif - q = row_to_proc[ii]; - jj = ptr_to_ibuf[q]; - send_ibuf[jj] = ii; - jj = ptr_to_dbuf[q]; - RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ - send_dbuf[jj++] = x[l + i + j*knsupc]; - } - ++ptr_to_ibuf[q]; - ptr_to_dbuf[q] += nrhs; - ++irow; + q = row_to_proc[ii]; + jj = ptr_to_ibuf[q]; + send_ibuf[jj] = ii; + jj = ptr_to_dbuf[q]; + RHS_ITERATE(j) { /* RHS stored in row major in buffer. */ + send_dbuf[jj++] = x[l + i + j*knsupc]; + } + ++ptr_to_ibuf[q]; + ptr_to_dbuf[q] += nrhs; + ++irow; + } + } } - } } - } - - /* ------------------------------------------------------------ - COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. - ------------------------------------------------------------*/ - + + /* ------------------------------------------------------------ + COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES. 
+ ------------------------------------------------------------*/ + #if 1 - + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); - MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); #else - MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, - recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i); - MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm,&req_d); - + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i); + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_DOUBLE, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm,&req_d); + MPI_Wait(&req_i,&status); MPI_Wait(&req_d,&status); #endif -// MPI_Barrier( grid->comm ); -// Nreq_send=0; -// for (pp=0;ppcomm ); + // Nreq_send=0; + // for (pp=0;pp0){ - // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; + // MPI_Isend(&send_ibuf[sdispls[pp]], SendCnt[pp], mpi_int_t, pp, 0, grid->comm, + // &req_send[Nreq_send] ); + // Nreq_send++; + // } // } -// } -// Nreq_recv=0; -// for (pp=0;pp0){ - // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; + // MPI_Irecv(&recv_ibuf[rdispls[pp]], RecvCnt[pp], mpi_int_t, pp, 0, grid->comm, + // &req_recv[Nreq_recv] ); + // Nreq_recv++; + // } // } -// } -// if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); -// if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); -// // MPI_Barrier( grid->comm ); + // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + // // MPI_Barrier( grid->comm ); -// Nreq_send=0; -// for (pp=0;pp0){ - // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_send[Nreq_send] ); - // Nreq_send++; + // MPI_Isend(&send_dbuf[sdispls_nrhs[pp]], SendCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, + // &req_send[Nreq_send] ); + // Nreq_send++; + // } // } -// } -// Nreq_recv=0; -// for (pp=0;pp0){ - // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, - // &req_recv[Nreq_recv] ); - // Nreq_recv++; + // MPI_Irecv(&recv_dbuf[rdispls_nrhs[pp]], RecvCnt_nrhs[pp], MPI_DOUBLE, pp, 1, grid->comm, + // &req_recv[Nreq_recv] ); + // Nreq_recv++; + // } // } -// } -// if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); -// if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); -// // MPI_Barrier( grid->comm ); + // if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + // if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + // // MPI_Barrier( grid->comm ); - /* ------------------------------------------------------------ - COPY THE BUFFER INTO B. - ------------------------------------------------------------*/ - for (i = 0, k = 0; i < m_loc; ++i) { - irow = recv_ibuf[i]; - irow -= fst_row; /* Relative row number */ - RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. 
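The copy-back loops above unpack a buffer that stores the nrhs right-hand sides row-major, while B itself is column-major with leading dimension ldb; RHS_ITERATE(j) is presumably just a loop over the nrhs columns (an assumption, the macro is defined elsewhere in the library). A tiny sketch of that indexing:

/* Sketch: unpacking an nrhs-major receive buffer into column-major B, as in
 * the loop above.  RHS_ITERATE is assumed to expand to a plain loop over j. */
#include <stdio.h>

#define RHS_ITERATE(j) for (j = 0; j < nrhs; ++j)   /* assumed expansion */

int main(void) {
    enum { m_loc = 2, nrhs = 3, ldb = 2 };
    double B[ldb*nrhs] = {0};
    double recv_dbuf[m_loc*nrhs] = { 1, 2, 3,   4, 5, 6 };  /* row major     */
    int j, k = 0;
    for (int i = 0; i < m_loc; ++i) {
        int irow = i;                         /* already relative to fst_row */
        RHS_ITERATE(j) B[irow + j*ldb] = recv_dbuf[k++];
    }
    for (int r = 0; r < m_loc; ++r, puts(""))
        for (j = 0; j < nrhs; ++j) printf("%4.0f", B[r + j*ldb]);
    return 0;
}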
*/ - B[irow + j*ldb] = recv_dbuf[k++]; + /* ------------------------------------------------------------ + COPY THE BUFFER INTO B. + ------------------------------------------------------------*/ + for (i = 0, k = 0; i < m_loc; ++i) { + irow = recv_ibuf[i]; + irow -= fst_row; /* Relative row number */ + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + B[irow + j*ldb] = recv_dbuf[k++]; + } } - } - - SUPERLU_FREE(send_ibuf); - SUPERLU_FREE(send_dbuf); - SUPERLU_FREE(req_send); - SUPERLU_FREE(req_recv); - SUPERLU_FREE(status_send); - SUPERLU_FREE(status_recv); - + + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()"); + CHECK_MALLOC(grid->iam, "Exit pdReDistribute_X_to_B()"); #endif - return 0; + return 0; } /* pdReDistribute_X_to_B */ -void + void pdCompute_Diag_Inv(int_t n, LUstruct_t *LUstruct,gridinfo_t *grid, SuperLUStat_t *stat, int *info) { - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - - double *lusup; - double *recvbuf, *tempv; - double *Linv;/* Inverse of diagonal block */ - double *Uinv;/* Inverse of diagonal block */ - - int_t kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; - int_t nb, nlb,nlb_nodiag, nub, nsupers; - int_t *xsup, *supno, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int Pc, Pr, iam; - int knsupc, nsupr; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - double **Linv_bc_ptr; - double **Uinv_bc_ptr; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + + double *lusup; + double *recvbuf, *tempv; + double *Linv;/* Inverse of diagonal block */ + double *Uinv;/* Inverse of diagonal block */ + + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb,nlb_nodiag, nub, nsupers; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + double **Lnzval_bc_ptr; + double **Linv_bc_ptr; + double **Uinv_bc_ptr; int INFO; double t; - + #if ( PROFlevel>=1 ) - t = SuperLU_timer_(); + t = SuperLU_timer_(); #endif - + // printf("wocao \n"); // fflush(stdout); if(grid->iam==0){ - printf("computing inverse of diagonal blocks...\n"); - fflush(stdout); + printf("computing inverse of diagonal blocks...\n"); + fflush(stdout); } - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Linv_bc_ptr = Llu->Linv_bc_ptr; - Uinv_bc_ptr = Llu->Uinv_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + /* + * Initialization. 
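+     This routine explicitly inverts the dense diagonal block of every
+     supernode owned by the local process and stores the results in
+     Linv_bc_ptr[] / Uinv_bc_ptr[].  With Llu->inv set to 1 here, the
+     triangular solves with L(k,k) and U(k,k) inside pdgstrs() are
+     replaced by GEMM calls against these precomputed inverses.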
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ Llu->inv = 1; - /*--------------------------------------------------- - * Compute inverse of L(lk,lk). - *---------------------------------------------------*/ + /*--------------------------------------------------- + * Compute inverse of L(lk,lk). + *---------------------------------------------------*/ for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); - if ( mycol == kcol ) { /* diagonal process */ - - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - Linv = Linv_bc_ptr[lk]; - Uinv = Uinv_bc_ptr[lk]; - nsupr = lsub[1]; - knsupc = SuperSize( k ); - - for (j=0 ; j=1 ) -if(grid->iam==0){ - t = SuperLU_timer_() - t; - printf(".. L-diag_inv time\t%10.5f\n", t); - fflush(stdout); -} + if(grid->iam==0){ + t = SuperLU_timer_() - t; + printf(".. L-diag_inv time\t%10.5f\n", t); + fflush(stdout); + } #endif - - return; + + return; } @@ -742,154 +746,178 @@ if(grid->iam==0){ * */ -void + void pdgstrs(int_t n, LUstruct_t *LUstruct, - ScalePermstruct_t *ScalePermstruct, - gridinfo_t *grid, double *B, - int_t m_loc, int_t fst_row, int_t ldb, int nrhs, - SOLVEstruct_t *SOLVEstruct, - SuperLUStat_t *stat, int *info) + ScalePermstruct_t *ScalePermstruct, + gridinfo_t *grid, double *B, + int_t m_loc, int_t fst_row, int_t ldb, int nrhs, + SOLVEstruct_t *SOLVEstruct, + SuperLUStat_t *stat, int *info) { - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - double alpha = 1.0; - double beta = 0.0; - double zero = 0.0; - double *lsum; /* Local running sum of the updates to B-components */ - double *x; /* X component at step k. */ - /* NOTE: x and lsum are of same size. */ - double *lusup, *dest; - double *recvbuf,*recvbuf_on, *tempv, *recvbufall, *recvbuf_BC_fwd, *recvbuf0; - double *rtemp; /* Result of full matrix-vector multiply. */ - double *Linv; /* Inverse of diagonal block */ - double *Uinv; /* Inverse of diagonal block */ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + double alpha = 1.0; + double beta = 0.0; + double zero = 0.0; + double *lsum; /* Local running sum of the updates to B-components */ + double *x; /* X component at step k. */ + /* NOTE: x and lsum are of same size. */ + double *lusup, *dest; + double *recvbuf,*recvbuf_on, *tempv, *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin; + double *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */ + double *Linv; /* Inverse of diagonal block */ + double *Uinv; /* Inverse of diagonal block */ int *ipiv; + int_t *leaf_send; + int_t nleaf_send, nleaf_send_tmp; + int_t *root_send; + int_t nroot_send, nroot_send_tmp; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; BcTree *LBtree_ptr = Llu->LBtree_ptr; RdTree *LRtree_ptr = Llu->LRtree_ptr; - int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. 
*/ - Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ - int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t kcol, krow, mycol, myrow; - int_t i, ii, il, j, jj, k, lb, ljb, lk, lib, lptr, luptr, gb, nn; - int_t nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i; - int_t *xsup, *supno, *lsub, *usub; - int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ - int Pc, Pr, iam; - int knsupc, nsupr, nprobe; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + int_t *Urbs1, *Urbs2; /* Number of row blocks in each block column of U. */ + int_t *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, kk, lb, ljb, lk, lib, lptr, luptr, gb, nn; + int_t nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr, nprobe; int nbtree, nrtree, outcount; - int ldalsum; /* Number of lsum entries locally owned. */ - int maxrecvsz, p, pi; - int_t **Lrowind_bc_ptr; - double **Lnzval_bc_ptr; - double **Linv_bc_ptr; - double **Uinv_bc_ptr; - double sum; - MPI_Status status,status_on,statusx,statuslsum; - MPI_Request *send_req, recv_req, req; - pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + double **Lnzval_bc_ptr; + double **Linv_bc_ptr; + double **Uinv_bc_ptr; + double sum; + MPI_Status status,status_on,statusx,statuslsum; + MPI_Request *send_req, recv_req, req; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + SuperLUStat_t **stat_loc; double tmax; - - /*-- Counts used for L-solve --*/ - int_t *fmod; /* Modification count for L-solve -- - Count the number of local block products to - be summed into lsum[lk]. */ - int_t **fsendx_plist = Llu->fsendx_plist; - int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ - int_t nfrecvx_buf=0; - int_t *frecv; /* Count of lsum[lk] contributions to be received - from processes in this row. - It is only valid on the diagonal processes. */ - int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ - int_t nleaf = 0, nroot = 0; - int_t nleaftmp = 0, nroottmp = 0; + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve -- + Count the number of local block products to + be summed into lsum[lk]. */ + int_t fmod_tmp; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t nfrecvx_buf=0; + int_t *frecv; /* Count of lsum[lk] contributions to be received + from processes in this row. + It is only valid on the diagonal processes. */ + int_t frecv_tmp; + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nfrecv = 0; /* Count of total messages to be recv'd. */ + int_t nbrecv = 0; /* Count of total messages to be recv'd. */ + int_t nleaf = 0, nroot = 0; + int_t nleaftmp = 0, nroottmp = 0; int_t msgsize; - /*-- Counts used for U-solve --*/ - int_t *bmod; /* Modification count for U-solve. */ - int_t **bsendx_plist = Llu->bsendx_plist; - int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. 
*/ - int_t *brecv; /* Count of modifications to be recv'd from - processes in this row. */ - int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t bmod_tmp; + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t nbrecvx_buf=0; + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t flagx,flaglsum,flag; - int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups; - StdList LBList, LRList; + int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; int_t TAG; - double t1_sol, t2_sol, t; + double t1_sol, t2_sol, t; #if ( DEBUGlevel>=2 ) - int_t Ublocks = 0; + int_t Ublocks = 0; #endif - int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ - int INFO; - -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; -#endif + int_t gik,iklrow,fnz; + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + int INFO, pad; + int_t tmpresult; + + // #if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + // #endif + + int_t *msgcnt=(int_t *) SUPERLU_MALLOC(4 * sizeof(int_t)); /* Count the size of the message xfer'd in each buffer: + * 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int iword = sizeof (int_t); + int dword = sizeof (double); + int Nwork; - int_t *msgcnt=(int_t *) SUPERLU_MALLOC(4 * sizeof(int_t)); /* Count the size of the message xfer'd in each buffer: - * 0 : transferred in Lsub_buf[] - * 1 : transferred in Lval_buf[] - * 2 : transferred in Usub_buf[] - * 3 : transferred in Uval_buf[] - */ - int iword = sizeof (int_t); - int dword = sizeof (double); - yes_no_t done; yes_no_t startforward; - + int nbrow; - int_t ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc; + int_t ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc; int_t lptr1_tmp, idx_i, idx_v,m; - - -int num_thread = 1; + + int_t thread_id,ready; + yes_no_t empty; + int_t sizelsum,sizertemp,aln_d,aln_i; + + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + int num_thread = 1; #ifdef _OPENMP #pragma omp parallel default(shared) - { - if (omp_get_thread_num () == 0) { - num_thread = omp_get_num_threads (); - } - } + { + if (omp_get_thread_num () == 0) { + num_thread = omp_get_num_threads (); + } + } #endif -if(iam==0){ - printf("num_thread: %5d\n",num_thread); - fflush(stdout); -} - + if(grid->iam==0){ + printf("num_thread: %5d\n",num_thread); + fflush(stdout); + } + MPI_Barrier( grid->comm ); - TIC(t1_sol); + TIC(t1_sol); t = SuperLU_timer_(); - - /* Test input parameters. */ - *info = 0; - if ( n < 0 ) *info = -1; - else if ( nrhs < 0 ) *info = -9; - if ( *info ) { - pxerr_dist("PDGSTRS", grid, -*info); - return; - } - - /* - * Initialization. - */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = supno[n-1] + 1; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Linv_bc_ptr = Llu->Linv_bc_ptr; - Uinv_bc_ptr = Llu->Uinv_bc_ptr; - nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + /* Test input parameters. 
*/ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PDGSTRS", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ stat->utime[SOL_COMM] = 0.0; stat->utime[SOL_COMM_PROBE] = 0.0; @@ -897,1366 +925,1478 @@ if(iam==0){ stat->utime[SOL_GEMM] = 0.0; stat->utime[SOL_TRSM] = 0.0; stat->utime[SOL_L] = 0.0; - - + + #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrs()"); + CHECK_MALLOC(iam, "Enter pdgstrs()"); #endif - stat->ops[SOLVE] = 0.0; - Llu->SolveMsgSent = 0; + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ - if ( !(fmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for fmod[]."); - for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; - - if ( !(frecv = intCalloc_dist(nlb)) ) - ABORT("Malloc fails for frecv[]."); - Llu->frecv = frecv; + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ - k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; - if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) - ABORT("Malloc fails for send_req[]."); + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + + if ( !(frecv = intCalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + + if ( !(leaf_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) ) + ABORT("Malloc fails for leaf_send[]."); + nleaf_send=0; + + if ( !(root_send = intMalloc_dist(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))) ) + ABORT("Malloc fails for root_send[]."); + nroot_send=0; + #ifdef _CRAY - ftcs1 = _cptofcd("L", strlen("L")); - ftcs2 = _cptofcd("N", strlen("N")); - ftcs3 = _cptofcd("U", strlen("U")); + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); #endif + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; - /* Obtain ilsum[] and ldalsum for process column 0. */ - ilsum = Llu->ilsum; - ldalsum = Llu->ldalsum; - - /* Allocate working storage. */ - knsupc = sp_ienv_dist(3); + /* Allocate working storage. 
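+       lsum and rtemp are replicated once per OpenMP thread: sizelsum is
+       rounded up to a multiple of aln_d = CACHELINE/sizeof(double), and
+       thread t writes only lsum[t*sizelsum ...] and rtemp[t*sizertemp ...].
+       The per-thread partial sums are combined later in the solve (the
+       loops over ii = 1 .. num_thread below), and the cache-line rounding
+       keeps different threads' segments off the same cache line.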
*/ + knsupc = sp_ienv_dist(3); nprobe = sp_ienv_dist(4); - maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) - ABORT("Calloc fails for lsum[]."); - if ( !(x = doubleCalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) - ABORT("Calloc fails for x[]."); - if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf[]."); - if ( !(recvbuf_on = doubleMalloc_dist(maxrecvsz)) ) - ABORT("Malloc fails for recvbuf_on[]."); - // // if ( !(recvbuf_BC_fwd = doubleMalloc_dist(maxrecvsz*nfrecvx)) ) // this needs to be optimized for 1D row mapping - // // ABORT("Malloc fails for recvbuf_BC_fwd[]."); - if ( !(rtemp = doubleCalloc_dist(ldalsum * nrhs)) ) - ABORT("Malloc fails for rtemp[]."); - if ( !(ipiv = intCalloc_dist(knsupc)) ) - ABORT("Malloc fails for ipiv[]."); - - - - // if ( !(recvbufall = doubleMalloc_dist(n)) ) - // ABORT("Malloc fails for recvbufall[]."); - - /*--------------------------------------------------- - * Forward solve Ly = b. - *---------------------------------------------------*/ - /* Redistribute B into X on the diagonal processes. */ - pdReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); - - + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H); + sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d; + +#ifdef _OPENMP + if ( !(lsum = doubleMalloc_dist(sizelsum*num_thread))) + ABORT("Calloc fails for lsum[]."); +#pragma omp parallel default(shared) private(thread_id,ii) + { + thread_id = omp_get_thread_num (); + for(ii=0;ii=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); - t = SuperLU_timer_(); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); + fflush(stdout); + t = SuperLU_timer_(); #endif - - - - /* Set up the headers in lsum[]. */ - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - il = LSUM_BLK( lk ); - lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + + /* Set up the headers in lsum[]. */ + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + } + ii += knsupc; } - ii += knsupc; - } - - - /* --------------------------------------------------------- - Precompute mapping from Lrowind_bc_ptr to lsum. 
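+       (The Lrowind_bc_2_lsum mapping is not used in this version of the
+       solver; the block below is kept only as commented-out reference.)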
- --------------------------------------------------------- */ - - nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !(Llu->Lrowind_bc_2_lsum = - (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) - ABORT("Malloc fails for Lrowind_bc_2_lsum[]."); - - - for (ljb = 0; ljb < nsupers_j; ++ljb) { - - - if(Lrowind_bc_ptr[ljb]!=NULL){ - - jb = mycol+ljb*grid->npcol; - - knsupc = SuperSize( jb ); - krow = PROW( jb, grid ); - nrbl = Lrowind_bc_ptr[ljb][0]; - - - if(myrow==krow){ /* skip the diagonal block */ - nlb_nodiag=nrbl-1; - idx_i = nlb_nodiag+2; - m = Lrowind_bc_ptr[ljb][1]-knsupc; - }else{ - nlb_nodiag=nrbl; - idx_i = nlb_nodiag; - m = Lrowind_bc_ptr[ljb][1]; - } - - if(nlb_nodiag>0){ - if ( !(Llu->Lrowind_bc_2_lsum[ljb] = intMalloc_dist(m*nrhs)) ) - ABORT("Malloc fails for Lrowind_bc_2_lsum[ljb][]."); - idx_r=0; - RHS_ITERATE(j) - for (lb = 0; lb < nlb_nodiag; ++lb) { - lptr1_tmp = Llu->Lindval_loc_bc_ptr[ljb][lb+idx_i]; - ik = Lrowind_bc_ptr[ljb][lptr1_tmp]; /* Global block number, row-wise. */ - iknsupc = SuperSize( ik ); - nbrow = Lrowind_bc_ptr[ljb][lptr1_tmp+1]; - lk = LBi( ik, grid ); /* Local block number, row-wise. */ - il = LSUM_BLK( lk ); - rel = xsup[ik]; /* Global row index of block ik. */ - for (ii = 0; ii < nbrow; ++ii) { - irow = Lrowind_bc_ptr[ljb][lptr1_tmp+LB_DESCRIPTOR+ii] - rel; /* Relative row. */ - Llu->Lrowind_bc_2_lsum[ljb][idx_r++] = il+irow+ j*iknsupc; - - - // {RHS_ITERATE(j) - // Llu->Lrowind_bc_2_lsum[ljb][idx_r+j*m] = il+irow+ j*iknsupc; - // } - // idx_r++; - - } - } - } - } - } + /* --------------------------------------------------------- + Precompute mapping from Lrowind_bc_ptr to lsum. + --------------------------------------------------------- */ + + + + // nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + // if ( !(Llu->Lrowind_bc_2_lsum = + // (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) + // ABORT("Malloc fails for Lrowind_bc_2_lsum[]."); - - - - /* --------------------------------------------------------- - Initialize the async Bcast trees on all processes. - --------------------------------------------------------- */ + // for (ljb = 0; ljb < nsupers_j; ++ljb) { + + // if(Lrowind_bc_ptr[ljb]!=NULL){ + + // jb = mycol+ljb*grid->npcol; + + // knsupc = SuperSize( jb ); + // krow = PROW( jb, grid ); + // nrbl = Lrowind_bc_ptr[ljb][0]; + + // if(myrow==krow){ /* skip the diagonal block */ + // nlb_nodiag=nrbl-1; + // idx_i = nlb_nodiag+2; + // m = Lrowind_bc_ptr[ljb][1]-knsupc; + // }else{ + // nlb_nodiag=nrbl; + // idx_i = nlb_nodiag; + // m = Lrowind_bc_ptr[ljb][1]; + // } + + // if(nlb_nodiag>0){ + // if ( !(Llu->Lrowind_bc_2_lsum[ljb] = intMalloc_dist(((m*nrhs + (aln_i - 1)) / aln_i) * aln_i)) ) + // ABORT("Malloc fails for Lrowind_bc_2_lsum[ljb][]."); + // idx_r=0; + // RHS_ITERATE(j) + // for (lb = 0; lb < nlb_nodiag; ++lb) { + // lptr1_tmp = Llu->Lindval_loc_bc_ptr[ljb][lb+idx_i]; + // ik = Lrowind_bc_ptr[ljb][lptr1_tmp]; /* Global block number, row-wise. */ + // iknsupc = SuperSize( ik ); + // nbrow = Lrowind_bc_ptr[ljb][lptr1_tmp+1]; + // lk = LBi( ik, grid ); /* Local block number, row-wise. */ + // il = LSUM_BLK( lk ); + // rel = xsup[ik]; /* Global row index of block ik. */ + // for (ii = 0; ii < nbrow; ++ii) { + // irow = Lrowind_bc_ptr[ljb][lptr1_tmp+LB_DESCRIPTOR+ii] - rel; /* Relative row. 
*/ + // Llu->Lrowind_bc_2_lsum[ljb][idx_r++] = il+irow+ j*iknsupc; + // } + // } + // }else{ + // Llu->Lrowind_bc_2_lsum[ljb]=NULL; + // } + // }else{ + // Llu->Lrowind_bc_2_lsum[ljb]=NULL; + // } + // } + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ - if ( !( LBTree_active = intCalloc_dist(nsupers_j)) ) - ABORT("Calloc fails for LBTree_active."); - LBList = StdList_Init(); - stat->MaxActiveBTrees=0; - if ( !( LBTree_finish = intCalloc_dist(nsupers_j)) ) - ABORT("Calloc fails for LBTree_finish."); - + nbtree = 0; for (lk=0;lk0)nfrecvx_buf++; - } - BcTree_allocateRequest(LBtree_ptr[lk]); + // printf("LBtree_ptr lk %5d\n",lk); + if(BcTree_IsRoot(LBtree_ptr[lk])==NO){ + nbtree++; + if(BcTree_getDestCount(LBtree_ptr[lk])>0)nfrecvx_buf++; + } + BcTree_allocateRequest(LBtree_ptr[lk]); } } - - - + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - if ( !( LRTree_active = intCalloc_dist(nsupers_i)) ) - ABORT("Calloc fails for LRTree_active."); - LRList = StdList_Init(); - stat->MaxActiveRTrees=0; - if ( !( LRTree_finish = intCalloc_dist(nsupers_i)) ) - ABORT("Calloc fails for LBTree_finish."); if ( !( leafsups = (int_t*)intCalloc_dist(nsupers_i)) ) - ABORT("Calloc fails for leafsups."); - - - - // if(iam==8){ - // printf("nsupers_j %5d \n",nsupers_j); - // fflush(stdout); - // for (lk=0;lknprow; /* not sure */ if(gb=2 ) - printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", - iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); fflush(stdout); #endif - - + + #if ( PRNTlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup L-solve time\t%8.3f\n", t); - t = SuperLU_timer_(); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); #endif - - /* --------------------------------------------------------- - Solve the leaf nodes first by all the diagonal processes. - --------------------------------------------------------- */ + + +#if ( VAMPIR>=1 ) + // VT_initialize(); + VT_traceon(); +#endif + + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) - printf("(%2d) nleaf %4d\n", iam, nleaf); + printf("(%2d) nleaf %4d\n", iam, nleaf); fflush(stdout); #endif - - -// #if ( PROFlevel>=1 ) - // TIC(t1); - // msgcnt[1] = maxrecvsz; -// #endif - - // for (jj=0;jj<19*3;jj++){ - // printf("Lindval %5d\n",Llu->Lindval_loc_bc_ptr[0][jj]); - // fflush(stdout); - // } - - - +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { - // for (k = 0; k < nsupers && nleaf; ++k) { - for (jj=0;jj=1 ) - TIC(t1); -#endif - if(Llu->inv == 1){ - Linv = Linv_bc_ptr[lk]; + TIC(t1); +#endif +#ifdef _OPENMP + thread_id = omp_get_thread_num (); +#else + thread_id = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + // if ( frecv[lk]==0 && fmod[lk]==0 ) { + // fmod[lk] = -1; /* Do not solve X[k] in the future. 
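+       Leaf supernodes, those with fmod[lk] == 0 and nothing to receive,
+       collected in leafsups[] above, are solved immediately: X[k] is
+       formed with the precomputed inverse Linv when Llu->inv == 1, and
+       the block is queued in leaf_send[] so that the tree broadcasts are
+       issued once the taskloop has finished.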
*/ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + + + + if(Llu->inv == 1){ + Linv = Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc, 1, 1 ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); #else - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat->utime[SOL_TRSM] += t2; - -#endif + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - // --nleaf; + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + // --nleaf; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); + printf("(%2d) Solve X[%2d]\n", iam, k); #endif - /* - * Send Xk to process column Pc[k]. - */ - - // if(LBtree_ptr[lk]!=NULL){ - // lib = LBi( k, grid ); /* Local block number, row-wise. */ - // ii = X_BLK( lib ); - // BcTree_SetLocalBuffer(LBtree_ptr[lk],&x[ii - XK_H]); - // BcTree_SetDataReady(LBtree_ptr[lk]); - // done = BcTree_Progress(LBtree_ptr[lk]); - // assert(done==NO); - - if(LBtree_ptr[lk]!=NULL){ - lib = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lib ); - BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); - } - - // for (p = 0; p < Pr; ++p) { - // if ( fsendx_plist[lk][p] != EMPTY ) { - // pi = PNUM( p, kcol, grid ); - - - // MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - // MPI_DOUBLE, pi, Xk, grid->comm, - // &send_req[Llu->SolveMsgSent++]); -// #if 0 - // MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - // MPI_DOUBLE, pi, Xk, grid->comm ); -// #endif + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[nleaf_send_tmp-1] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + } + } + } + } + } -// #if ( DEBUGlevel>=2 ) - // printf("(%2d) Sent X[%2.0f] to P %2d\n", - // iam, x[ii-XK_H], pi); -// #endif - // } - // } - - - - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ +#if ( VTUNE>=1 ) + __itt_resume(); +#endif - + jj=0; +#ifdef _OPENMP +#pragma omp parallel default (shared) private(thread_id) + { + thread_id = omp_get_thread_num (); +#else + { + thread_id = 0; +#endif - - // printf("%5d Iam in %5d\n",iam,nleaf); - // fflush(stdout); - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - // printf("%5d Iam out %5d",iam,nleaf); - // fflush(stdout); - - - // } - // } /* if diagonal process ... */ - } /* for k ... 
*/ - +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop private (i,k,ii,knsupc,lk,nb,lptr,luptr,lsub,lusup,thread_id) untied num_tasks(num_thread*8) nogroup +#endif - int data_recv=0; - int msg_num = nfrecvx+nfrecvmod; - int nfpost=0,nfrecv=0; - jj=0; - recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; - /* ----------------------------------------------------------- - Compute the internal nodes asynchronously by all processes. - ----------------------------------------------------------- */ - - while ( nfrecvx || nfrecvmod ) { /* While not finished. */ - - // printf("iam %5d, nfrecv %5d, num %5d jj %5d nleaf %5d\n",iam,nfrecv,msg_num,jj,nleaf); - // fflush(stdout); + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ]); + } + } -#if ( PROFlevel>=1 ) - TIC(t1); - msgcnt[1] = maxrecvsz; -#endif +#if ( VTUNE>=1 ) + __itt_pause(); +#endif - /* Receive a message. */ - MPI_Recv( recvbuf0, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ - +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { + for ( nfrecv =0; nfrecv=1 ) - + TIC(t1); + // msgcnt[1] = maxrecvsz; +#endif - - - - // if(iam==0){ - // printf("time: %8.5f\n") - // } - - - TOC(t2, t1); - stat->utime[SOL_COMM] += t2; - - msg_cnt += 1; - msg_vol += msgcnt[1] * dword; + recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_DOUBLE, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + // MPI_Irecv(recvbuf0,maxrecvsz,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req); + // ready=0; + // while(ready==0){ + // MPI_Test(&req,&ready,&status); + // #pragma omp taskyield + // } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + + msg_cnt += 1; + msg_vol += maxrecvsz * dword; #endif - - - k = *recvbuf0; - - - - - + { + k = *recvbuf0; #if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif - - if(status.MPI_TAG0){ - - // msgsize = SuperSize( status.MPI_TAG )*nrhs+XK_H; - // for (i = 0; i < msgsize; ++i) - // recvbuf_BC_fwd[i + nfrecvx_buf*maxrecvsz] = recvbuf0[i]; - - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]); - // nfrecvx_buf++; - - - - - - BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0); - nfrecvx_buf++; - } - - - lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - if ( lsub ) { - nb = lsub[0]; - lptr = BC_HEADER; - luptr = 0; - knsupc = SuperSize( k ); - - /* - * Perform local block modifications: lsum[i] -= L_i,k * X[k] - */ - dlsum_fmod_inv(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - } /* if lsub */ + if(status.MPI_TAG==BC_L){ + // --nfrecvx; + nfrecvx_buf++; + { + lk = LBj( k, grid ); /* local block number */ + + if(BcTree_getDestCount(LBtree_ptr[lk])>0){ + + BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + if(myrow==krow){ + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + }else{ + nb = lsub[0]; + knsupc = SuperSize( k ); + xin = &recvbuf0[XK_H] ; + } + + dlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0); + + } /* if lsub */ + } + + }else if(status.MPI_TAG==RD_L){ + // --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + fmod_tmp=--fmod[lk]; + { + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( fmod_tmp==0 ) { + if(RdTree_IsRoot(LRtree_ptr[lk])==YES){ + // ii = X_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii0){ - recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; - } - - }else if(status.MPI_TAG>=nsupers && status.MPI_TAG=1 ) - TIC(t1); - #endif - - - if(Llu->inv == 1){ - Linv = Linv_bc_ptr[lk]; - #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); - #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc, 1, 1 ); - #else - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Linv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); - #endif - for (i=0 ; i=1 ) + TIC(t1); +#endif + if(Llu->inv == 1){ + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + for (i=0 ; i=1 ) - TOC(t2, t1); - stat->utime[SOL_TRSM] += t2; - - #endif +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif - stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; - #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); - #endif - - /* - * Send Xk to process column Pc[k]. 
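+        The block is forwarded down the broadcast tree for this block
+        column (BcTree_forwardMessageSimple on LBtree_ptr[lk]) instead of
+        one MPI_Isend per process row listed in fsendx_plist[], which is
+        what the commented-out code below used to do.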
- */ - // // // kcol = PCOL( k, grid ); - // // // for (p = 0; p < Pr; ++p) { - // // // if ( fsendx_plist[lk][p] != EMPTY ) { - // // // pi = PNUM( p, kcol, grid ); - - - // // // MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, - // // // MPI_DOUBLE, pi, Xk, grid->comm, - // // // &send_req[Llu->SolveMsgSent++]); - // // // #if 0 - // // // MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - // // // MPI_DOUBLE, pi, Xk, grid->comm ); - // // // #endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif - + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + } + /* + * Perform local block modifications. + */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + dlsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0); + } /* if lsub */ + // } + + }else{ + + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + + for (ii=1;ii=2 ) - // // // printf("(%2d) Sent X[%2.0f] to P %2d\n", - // // // iam, x[ii-XK_H], pi); - // // // #endif - // // // } - // // // } - - if(LBtree_ptr[lk]!=NULL){ - BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); - } - - - /* - * Perform local block modifications. - */ - nb = lsub[0] - 1; - lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; - luptr = knsupc; /* Skip diagonal block L(k,k). */ - - - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, - fmod, nb, lptr, luptr, xsup, grid, Llu, - send_req, stat); - - } - - }else{ - - il = LSUM_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf0[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - lsum[i + il + j*knsupc] += tempv[i + j*knsupc]; - } - if ( (frecv[lk])==0 && fmod[lk]==0 ) { - fmod[lk] = -1; - RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il-LSUM_H]); - } - } - - // nfrecvx_buf++; - recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; - - - - } /* check Tag */ - } /* while not finished ... */ #if ( PRNTlevel>=1 ) - t = SuperLU_timer_() - t; - stat->utime[SOL_L] = t; - if ( !iam ) printf(".. L-solve time\t%8.4f\n", t); - MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, - MPI_MAX, 0, grid->comm); - if ( !iam ) printf(".. L-solve time (MAX) \t%8.4f\n", tmax); - - MPI_Reduce (&stat->utime[SOL_GEMM], &tmax, 1, MPI_DOUBLE, - MPI_MAX, 0, grid->comm); - if ( !iam ) printf(".. L-GEMM time (MAX) \t%8.4f\n", tmax); - - MPI_Reduce (&stat->utime[SOL_COMM], &tmax, 1, MPI_DOUBLE, - MPI_MAX, 0, grid->comm); - if ( !iam ) printf(".. L-COMM time (MAX) \t%8.4f\n", tmax); - - - t = SuperLU_timer_(); -#endif + t = SuperLU_timer_() - t; + stat->utime[SOL_L] = t; + if ( !iam ) { + printf(".. L-solve time\t%8.4f\n", t); + fflush(stdout); + } -#if ( DEBUGlevel==2 ) - { - printf("(%d) .. 
After L-solve: y =\n", iam); - for (i = 0, k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - ii = X_BLK( lk ); - for (j = 0; j < knsupc; ++j) - printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - } -#endif - SUPERLU_FREE(fmod); - SUPERLU_FREE(frecv); + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, + MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. L-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } - /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ + t = SuperLU_timer_(); +#endif - // for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); - // Llu->SolveMsgSent = 0; - - for (lk=0;lkcomm ); + } } - } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(leaf_send); + SUPERLU_FREE(leafsups); + SUPERLU_FREE(recvbuf_BC_fwd); + + // for (ljb = 0; ljb < nsupers_j; ++ljb) + // if(Llu->Lrowind_bc_2_lsum[ljb]!=NULL) + // SUPERLU_FREE(Llu->Lrowind_bc_2_lsum[ljb]); + // SUPERLU_FREE(Llu->Lrowind_bc_2_lsum); - for (lk=0;lkcomm ); - - - /*--------------------------------------------------- - * Back solve Ux = y. - * - * The Y components from the forward solve is already - * on the diagonal processes. - *---------------------------------------------------*/ - - /* Save the count to be altered so it can be used by - subsequent call to PDGSTRS. */ - if ( !(bmod = intMalloc_dist(nlb)) ) - ABORT("Calloc fails for bmod[]."); - for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; - if ( !(brecv = intMalloc_dist(nlb)) ) - ABORT("Malloc fails for brecv[]."); - Llu->brecv = brecv; - - /* - * Compute brecv[] and nbrecvmod counts on the diagonal processes. - */ - { - superlu_scope_t *scp = &grid->rscp; -#if 1 - for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); /* root process in this row scope */ - if ( mycol != kcol && bmod[lk] ) - mod_bit[lk] = 1; /* Contribution from off-diagonal */ - } - } + for (lk=0;lkcomm ); - /* Every process receives the count, but it is only useful on the - diagonal processes. */ - MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); - // MPI_Iallreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm, &req); - // MPI_Wait(&req,&status); +#if ( VAMPIR>=1 ) + VT_traceoff(); + VT_finalize(); +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intCalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + // if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + // ABORT("Malloc fails for send_req[]."); + + /* Re-initialize lsum to zero. Each block header is already in place. 
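+       Every thread's private copy is cleared: with OpenMP each thread
+       zeroes its own sizelsum-long segment, otherwise the loop zeroes all
+       num_thread segments, so the U-solve can reuse lsum for the
+       lsum[i] -= U_i,k * X[k] updates.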
*/ +#ifdef _OPENMP + + #pragma omp parallel default(shared) private(thread_id,k,krow,knsupc,lk,il,dest,j,i) + { + thread_id = omp_get_thread_num (); + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + thread_id*sizelsum] = zero; + } + } + } + } + +#else for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* local block number */ - kcol = PCOL( k, grid ); /* root process in this row scope. */ - if ( mycol == kcol ) { /* diagonal process */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + + for (jj = 0; jj < num_thread; ++jj) { + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc + jj*sizelsum] = zero; + } + } } - } } +#endif + -#else /* old */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - lk = LBi( k, grid ); /* Local block number. */ - kcol = PCOL( k, grid ); /* Root process in this row scope. */ - if ( mycol != kcol && bmod[lk] ) - i = 1; /* Contribution from non-diagonal process. */ - else i = 0; - MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, - MPI_SUM, kcol, scp->comm ); - if ( mycol == kcol ) { /* Diagonal process. */ - nbrecvmod += brecv[lk]; - if ( !brecv[lk] && !bmod[lk] ) ++nroot; + // /* Set up additional pointers for the index and value arrays of U. + // nub is the number of local block columns. */ + // nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + // if ( !(Urbs = (int_t *) intCalloc_dist(3*nub)) ) + // ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + // blocks in a block column. */ + // Urbs1 = Urbs + nub; + // Urbs2 = Urbs + nub*2; + // if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + // ABORT("Malloc fails for Ucb_indptr[]"); + // if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + // ABORT("Malloc fails for Ucb_valptr[]"); + + // /* Count number of row blocks in a block column. + // One pass of the skeleton graph of U. */ + // for (lk = 0; lk < nlb; ++lk) { + // usub = Ufstnz_br_ptr[lk]; + // if ( usub ) { /* Not an empty block row. */ + // /* usub[0] -- number of column blocks in this block row. */ +// #if ( DEBUGlevel>=2 ) + // Ublocks += usub[0]; +// #endif + // i = BR_HEADER; /* Pointer in index array. */ + // for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + // k = usub[i]; /* Global block number */ + // ++Urbs[LBj(k,grid)]; + // i += UB_DESCRIPTOR + SuperSize( k ); + // } + // } + // } + + // /* Set up the vertical linked lists for the row blocks. + // One pass of the skeleton graph of U. */ + // for (lb = 0; lb < nub; ++lb) { + // if ( Urbs[lb] ) { /* Not an empty block column. */ + // if ( !(Ucb_indptr[lb] + // = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + // ABORT("Malloc fails for Ucb_indptr[lb][]"); + // if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + // ABORT("Malloc fails for Ucb_valptr[lb][]"); + // } + // } + // for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + // usub = Ufstnz_br_ptr[lk]; + // if ( usub ) { /* Not an empty block row. 
*/ + // i = BR_HEADER; /* Pointer in index array. */ + // j = 0; /* Pointer in nzval array. */ + + // // gik = lk * grid->nprow + myrow;/* Global block number, row-wise. */ + // // iklrow = FstBlockC( gik+1 ); + + // for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + // k = usub[i]; /* Global block number, column-wise. */ + // ljb = LBj( k, grid ); /* Local block number, column-wise. */ + // Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + + + + // Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + // Ucb_valptr[ljb][Urbs1[ljb]] = j; + + // // knsupc = SuperSize( k ); + // // nbrow = 0; + // // for (jj = 0; jj < knsupc; ++jj) { + // // fnz = usub[i +UB_DESCRIPTOR+ jj]; + // // if ( fnz < iklrow ) { + // // if(nbrow=2 ) - printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); - assert( brecv[lk] < Pc ); -#endif + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); } - } - } -#endif - } - - /* Re-initialize lsum to zero. Each block header is already in place. */ - for (k = 0; k < nsupers; ++k) { - krow = PROW( k, grid ); - if ( myrow == krow ) { - knsupc = SuperSize( k ); - lk = LBi( k, grid ); - il = LSUM_BLK( lk ); - dest = &lsum[il]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; - } - } - } - - /* Set up additional pointers for the index and value arrays of U. - nub is the number of local block columns. */ - nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ - if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) - ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero - blocks in a block column. */ - Urbs1 = Urbs + nub; - if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) - ABORT("Malloc fails for Ucb_indptr[]"); - if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) - ABORT("Malloc fails for Ucb_valptr[]"); - - /* Count number of row blocks in a block column. - One pass of the skeleton graph of U. */ - for (lk = 0; lk < nlb; ++lk) { - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - /* usub[0] -- number of column blocks in this block row. */ -#if ( DEBUGlevel>=2 ) - Ublocks += usub[0]; -#endif - i = BR_HEADER; /* Pointer in index array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number */ - ++Urbs[LBj(k,grid)]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } - - /* Set up the vertical linked lists for the row blocks. - One pass of the skeleton graph of U. */ - for (lb = 0; lb < nub; ++lb) { - if ( Urbs[lb] ) { /* Not an empty block column. */ - if ( !(Ucb_indptr[lb] - = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) - ABORT("Malloc fails for Ucb_indptr[lb][]"); - if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) - ABORT("Malloc fails for Ucb_valptr[lb][]"); - } - } - for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ - usub = Ufstnz_br_ptr[lk]; - if ( usub ) { /* Not an empty block row. */ - i = BR_HEADER; /* Pointer in index array. */ - j = 0; /* Pointer in nzval array. */ - for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ - k = usub[i]; /* Global block number, column-wise. 
*/ - ljb = LBj( k, grid ); /* Local block number, column-wise. */ - Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; - Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; - Ucb_valptr[ljb][Urbs1[ljb]] = j; - ++Urbs1[ljb]; - j += usub[i+1]; - i += UB_DESCRIPTOR + SuperSize( k ); - } - } - } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ -#if ( DEBUGlevel>=2 ) - for (p = 0; p < Pr*Pc; ++p) { - if (iam == p) { - printf("(%2d) .. Ublocks %d\n", iam, Ublocks); - for (lb = 0; lb < nub; ++lb) { - printf("(%2d) Local col %2d: # row blocks %2d\n", - iam, lb, Urbs[lb]); - if ( Urbs[lb] ) { - for (i = 0; i < Urbs[lb]; ++i) - printf("(%2d) .. row blk %2d:\ - lbnum %d, indpos %d, valpos %d\n", - iam, i, - Ucb_indptr[lb][i].lbnum, - Ucb_indptr[lb][i].indpos, - Ucb_valptr[lb][i]); + + + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nbrecvx_buf++; + } + BcTree_allocateRequest(UBtree_ptr[lk]); } - } - } - MPI_Barrier( grid->comm ); - } - for (p = 0; p < Pr*Pc; ++p) { - if ( iam == p ) { - printf("\n(%d) bsendx_plist[][]", iam); - for (lb = 0; lb < nub; ++lb) { - printf("\n(%d) .. local col %2d: ", iam, lb); - for (i = 0; i < Pr; ++i) - printf("%4d", bsendx_plist[lb][i]); - } - printf("\n"); } - MPI_Barrier( grid->comm ); - } -#endif /* DEBUGlevel */ + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( rootsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for rootsups."); + + nrtree = 0; + nroot=0; + for (lk=0;lknprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nbrecvx %4d, nbrecvmod %4d, nroot %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree); + fflush(stdout); +#endif #if ( PRNTlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. Setup U-solve time\t%8.3f\n", t); - t = SuperLU_timer_(); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); #endif - /* - * Solve the roots first by all the diagonal processes. - */ + /* + * Solve the roots first by all the diagonal processes. + */ #if ( DEBUGlevel>=2 ) - printf("(%2d) nroot %4d\n", iam, nroot); - fflush(stdout); + printf("(%2d) nroot %4d\n", iam, nroot); + fflush(stdout); #endif - for (k = nsupers-1; k >= 0 && nroot; --k) { - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ - knsupc = SuperSize( k ); - lk = LBi( k, grid ); /* Local block number, row-wise. */ - if ( brecv[lk]==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. 
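+       Root supernodes, those with bmod[lk] == 0 and nothing to receive,
+       collected in rootsups[] above, are solved first: X[k] is formed
+       with the precomputed inverse Uinv when Llu->inv == 1, queued in
+       root_send[] for tree forwarding after the taskloop, and the local
+       updates lsum[i] -= U_i,k * X[k] are applied through dlsum_bmod_inv().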
*/ - ii = X_BLK( lk ); - lk = LBj( k, grid ); /* Local block number, column-wise */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; - - if(Llu->inv == 1){ - Uinv = Uinv_bc_ptr[lk]; + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,thread_id,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp) nogroup +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif +#ifdef _OPENMP + thread_id = omp_get_thread_num (); +#else + thread_id = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + // bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + + if(Llu->inv == 1){ + + Uinv = Uinv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc, 1, 1 ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); #else - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #endif - - for (i=0 ; iops[SOLVE] += knsupc * (knsupc + 1) * nrhs; - --nroot; -#if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); + // for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif -// #if ( PROFlevel>=1 ) - // TIC(t1); - // msgcnt[1] = knsupc * nrhs + XK_H; -// #endif + /* + * Send Xk to process column Pc[k]. + */ - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++]); -#if 0 - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, - grid->comm ); + if(UBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture #endif + nroot_send_tmp = ++nroot_send; + root_send[nroot_send_tmp-1] = lk; + + // lib = LBi( k, grid ); /* Local block number, row-wise. */ + // ii = X_BLK( lib ); + // BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H]); + } + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat_loc, root_send, &nroot_send, sizelsum,sizertemp); + + } /* for k ... 
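+       The sends recorded in root_send[] during the taskloop are issued
+       in the loop below: a nonnegative entry broadcasts the X block down
+       UBtree_ptr[], while a negative entry (stored as -lk-1) forwards the
+       corresponding lsum block along the reduction tree URtree_ptr[],
+       mirroring the leaf_send[] handling in the L-solve.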
*/ + } +} -// #if ( PROFlevel>=1 ) - // TOC(t2, t1); - // stat->utime[SOL_COMM] += t2; - - // msg_cnt += 1; - // msg_vol += msgcnt[1] * dword; -// #endif + +for (i=0;i=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H]); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ]); + } +} -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif - } - } /* - * Perform local block modifications: lsum[i] -= U_i,k * X[k] + * Compute the internal nodes asychronously by all processes. */ - if ( Urbs[lk] ) - dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); - } /* if root ... */ - } /* if diagonal process ... */ - } /* for k ... */ - - - /* - * Compute the internal nodes asychronously by all processes. - */ - -// printf("nbrecvx %5d nbrecvmod %5d\n",nbrecvx,nbrecvmod); -// fflush(stdout); - - while ( nbrecvx || nbrecvmod ) { /* While not finished. */ - - -// #if ( PROFlevel>=1 ) - // TIC(t1); - // msgcnt[1] = maxrecvsz; -// #endif - - /* Receive a message. */ - MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, - MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); - -// #if ( PROFlevel>=1 ) - // TOC(t2, t1); - // stat->utime[SOL_COMM] += t2; - - // msg_cnt += 1; - // msg_vol += msgcnt[1] * dword; -// #endif - - k = *recvbuf; +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + for ( nbrecv =0; nbrecv=1 ) + TIC(t1); +#endif + + recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_DOUBLE, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + k = *recvbuf0; #if ( DEBUGlevel>=2 ) - printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + fflush(stdout); #endif - switch ( status.MPI_TAG ) { - case Xk: - --nbrecvx; - lk = LBj( k, grid ); /* Local block number, column-wise. */ - /* - * Perform local block modifications: - * lsum[i] -= U_i,k * X[k] - */ - dlsum_bmod_inv(lsum, x, &recvbuf[XK_H], rtemp, nrhs, k, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - send_req, stat); + if(status.MPI_TAG==BC_U){ + // --nfrecvx; + nbrecvx_buf++; + + lk = LBj( k, grid ); /* local block number */ - break; + if(BcTree_getDestCount(UBtree_ptr[lk])>0){ - case LSUM: /* Receiver must be a diagonal process */ - --nbrecvmod; - lk = LBi( k, grid ); /* Local block number, row-wise. */ - ii = X_BLK( lk ); - knsupc = SuperSize( k ); - tempv = &recvbuf[LSUM_H]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) - x[i + ii + j*knsupc] += tempv[i + j*knsupc]; - } + BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0); + // nfrecvx_buf++; + } - if ( (--brecv[lk])==0 && bmod[lk]==0 ) { - bmod[lk] = -1; /* Do not solve X[k] in the future. */ - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - nsupr = lsub[1]; + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + dlsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat_loc, sizelsum,sizertemp); + }else if(status.MPI_TAG==RD_U){ + + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[lk]; + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( bmod_tmp==0 ) { + if(RdTree_IsRoot(URtree_ptr[lk])==YES){ + + knsupc = SuperSize( k ); + for (ii=1;iiinv == 1){ + + Uinv = Uinv_bc_ptr[lk]; - if(Llu->inv == 1){ - - Uinv = Uinv_bc_ptr[lk]; - #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc, 1, 1 ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); #else - dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, - &alpha, Uinv, &knsupc, &x[ii], - &knsupc, &beta, rtemp, &knsupc ); + dgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); #endif - - for (i=0 ; iops[SOLVE] += knsupc * (knsupc + 1) * nrhs; +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, k); -#endif - /* - * Send Xk to process column Pc[k]. - */ - kcol = PCOL( k, grid ); - for (p = 0; p < Pr; ++p) { - if ( bsendx_plist[lk][p] != EMPTY ) { - pi = PNUM( p, kcol, grid ); - -// #if ( PROFlevel>=1 ) - // TIC(t1); - // msgcnt[1] = knsupc * nrhs + XK_H; -// #endif - - MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#if 0 - MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, - grid->comm ); + printf("(%2d) Solve X[%2d]\n", iam, k); #endif -// #if ( PROFlevel>=1 ) - // TOC(t2, t1); - // stat->utime[SOL_COMM] += t2; - - // msg_cnt += 1; - // msg_vol += msgcnt[1] * dword; -// #endif - -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii - XK_H], pi); -#endif + /* + * Send Xk to process column Pc[k]. + */ + if(UBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H]); + } + + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + dlsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat_loc, sizelsum,sizertemp); + + }else{ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + + for (ii=1;ii=2 ) - default: - printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); - break; -#endif + } /* while not finished ... */ + } +#if ( PRNTlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.4f\n", t); + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, + MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. 
U-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif - } /* switch */ - } /* while not finished ... */ -#if ( PRNTlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam ) printf(".. U-solve time\t%8.3f\n", t); - t = SuperLU_timer_(); -#endif #if ( DEBUGlevel>=2 ) - { - double *x_col; - int diag; - printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); - ii = 0; - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - diag = PNUM( krow, kcol, grid); - if ( iam == diag ) { /* Diagonal process. */ - lk = LBi( k, grid ); - jj = X_BLK( lk ); - x_col = &x[jj]; - RHS_ITERATE(j) { - for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ - printf("\t(%d)\t%4d\t%.10f\n", - iam, xsup[k]+i, x_col[i]); - } - x_col += knsupc; + { + double *x_col; + int diag; + printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + diag = PNUM( krow, kcol, grid); + if ( iam == diag ) { /* Diagonal process. */ + lk = LBi( k, grid ); + jj = X_BLK( lk ); + x_col = &x[jj]; + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ + printf("\t(%d)\t%4d\t%.10f\n", + iam, xsup[k]+i, x_col[i]); + } + x_col += knsupc; + } + } + ii += knsupc; + } /* for k ... */ } - } - ii += knsupc; - } /* for k ... */ - } #endif - pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, - ScalePermstruct, Glu_persist, grid, SOLVEstruct); + pdReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); #if ( PRNTlevel>=1 ) - t = SuperLU_timer_() - t; - if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t); - t = SuperLU_timer_(); + t = SuperLU_timer_() - t; + if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t); + t = SuperLU_timer_(); #endif - - - - /* Deallocate storage. */ - SUPERLU_FREE(rtemp); - SUPERLU_FREE(lsum); - SUPERLU_FREE(x); - SUPERLU_FREE(recvbuf); - for (i = 0; i < nub; ++i) { - if ( Urbs[i] ) { - SUPERLU_FREE(Ucb_indptr[i]); - SUPERLU_FREE(Ucb_valptr[i]); - } - } - SUPERLU_FREE(Ucb_indptr); - SUPERLU_FREE(Ucb_valptr); - SUPERLU_FREE(Urbs); - SUPERLU_FREE(bmod); - SUPERLU_FREE(brecv); - /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ - for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); - SUPERLU_FREE(send_req); + double tmp1=0; + double tmp2=0; + double tmp3=0; + double tmp4=0; + for(i=0;iutime[SOL_TRSM]); + tmp2 = MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]); + tmp3 = MAX(tmp3,stat_loc[i]->utime[SOL_COMM]); + tmp4 += stat_loc[i]->ops[SOLVE]; +#if ( PRNTlevel>=2 ) + if(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]); +#endif + } + + + stat->utime[SOL_TRSM] += tmp1; + stat->utime[SOL_GEMM] += tmp2; + stat->utime[SOL_COMM] += tmp3; + stat->ops[SOLVE]+= tmp4; + + + /* Deallocate storage. 
*/ + SUPERLU_FREE(stat_loc); + SUPERLU_FREE(rtemp); + SUPERLU_FREE(lsum); + SUPERLU_FREE(x); + // SUPERLU_FREE(recvbuf); + + + // for (i = 0; i < nub; ++i) { + // if ( Urbs[i] ) { + // SUPERLU_FREE(Ucb_indptr[i]); + // SUPERLU_FREE(Ucb_valptr[i]); + // } + // } + // SUPERLU_FREE(Ucb_indptr); + // SUPERLU_FREE(Ucb_valptr); + // SUPERLU_FREE(Urbs); + + + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); + SUPERLU_FREE(root_send); + + SUPERLU_FREE(rootsups); + SUPERLU_FREE(recvbuf_BC_fwd); + + for (lk=0;lkcomm ); + + + + + + /*for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]);*/ + + // for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Wait(&send_req[i], &status); + // SUPERLU_FREE(send_req); + + // MPI_Barrier( grid->comm ); - MPI_Barrier( grid->comm ); - #if ( PROFlevel>=2 ) - { - float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; - - MPI_Reduce (&msg_cnt, &msg_cnt_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Reduce (&msg_cnt, &msg_cnt_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - MPI_Reduce (&msg_vol, &msg_vol_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Reduce (&msg_vol, &msg_vol_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - if (!iam) { - printf ("\tPDGSTRS comm stat:" - "\tAvg\tMax\t\tAvg\tMax\n" - "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", - msg_cnt_sum / Pr / Pc, msg_cnt_max, - msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); - } - } + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) { + printf ("\tPDGSTRS comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + } + } #endif - - TOC(t2_sol,t1_sol); - stat->utime[SOLVE] = t2_sol; -#if ( PROFlevel>=1 ) - { - int_t i, P = grid->nprow*grid->npcol; - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. Msg_vol breakdown:\tpr\tpc\tMB\tCNT\n"); - fflush(stdout); - for (i = 0; i < P; ++i) { - if ( iam == i) { - printf("\t\t%5d %5d %15.4e %10d\n", myrow, mycol , msg_vol * 1e-6, (int_t)msg_cnt); - fflush(stdout); - } - MPI_Barrier( grid->comm ); - } - fflush(stdout); - sleep(2.0); - MPI_Barrier( grid->comm ); - } -#endif + TOC(t2_sol,t1_sol); + stat->utime[SOLVE] = t2_sol; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrs()"); + CHECK_MALLOC(iam, "Exit pdgstrs()"); #endif - return; -} /* PDGSTRS */ + return; + } /* PDGSTRS */ diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c index 7af94c1a..b9abffef 100644 --- a/SRC/pdgstrs_lsum.c +++ b/SRC/pdgstrs_lsum.c @@ -1,13 +1,13 @@ /*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) + Copyright (c) 2003, The Regents of the University of California, through + Lawrence Berkeley National Laboratory (subject to receipt of any required + approvals from U.S. Dept. of Energy) -All rights reserved. + All rights reserved. -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ + The source code is distributed under BSD license, see the file License.txt + at the top-level directory. + */ /*! 
@file @@ -34,14 +34,20 @@ at the top-level directory. */ #ifdef _CRAY fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, - double*, int*, double*, int*); + double*, int*, double*, int*); fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); + int*, double*, int*, double*, double*, int*); _fcd ftcs1; _fcd ftcs2; _fcd ftcs3; #endif + +// #ifndef CACHELINE +// #define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +// #endif + + /************************************************************************/ /*! \brief * @@ -70,181 +76,181 @@ void dlsum_fmod LocalLU_t *Llu, MPI_Request send_req[], /* input/output */ SuperLUStat_t *stat -) + ) { - // // // // double alpha = 1.0, beta = 0.0; - // // // // double *lusup, *lusup1; - // // // // double *dest; - // // // // int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; - // // // // int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; - // // // // int_t *lsub, *lsub1, nlb1, lptr1, luptr1; - // // // // int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - // // // // int_t *frecv = Llu->frecv; - // // // // int_t **fsendx_plist = Llu->fsendx_plist; - // // // // MPI_Status status; - // // // // int test_flag; - -// // // // #if ( PROFlevel>=1 ) - // // // // double t1, t2; - // // // // float msg_vol = 0, msg_cnt = 0; -// // // // #endif + double alpha = 1.0, beta = 0.0; + double *lusup, *lusup1; + double *dest; + int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; + int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + MPI_Status status; + int test_flag; - -// // // // #if ( PROFlevel>=1 ) - // // // // TIC(t1); -// // // // #endif - - // // // // iam = grid->iam; - // // // // myrow = MYROW( iam, grid ); - // // // // lk = LBj( k, grid ); /* Local block number, column-wise. */ - // // // // lsub = Llu->Lrowind_bc_ptr[lk]; - // // // // lusup = Llu->Lnzval_bc_ptr[lk]; - // // // // nsupr = lsub[1]; - - // // // // for (lb = 0; lb < nlb; ++lb) { - // // // // ik = lsub[lptr]; /* Global block number, row-wise. */ - // // // // nbrow = lsub[lptr+1]; -// // // // #ifdef _CRAY - // // // // SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - // // // // &alpha, &lusup[luptr], &nsupr, xk, - // // // // &knsupc, &beta, rtemp, &nbrow ); -// // // // #elif defined (USE_VENDOR_BLAS) - // // // // dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - // // // // &alpha, &lusup[luptr], &nsupr, xk, - // // // // &knsupc, &beta, rtemp, &nbrow, 1, 1 ); -// // // // #else - // // // // dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - // // // // &alpha, &lusup[luptr], &nsupr, xk, - // // // // &knsupc, &beta, rtemp, &nbrow ); -// // // // #endif - // // // // stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; - - // // // // lk = LBi( ik, grid ); /* Local block number, row-wise. */ - // // // // iknsupc = SuperSize( ik ); - // // // // il = LSUM_BLK( lk ); - // // // // dest = &lsum[il]; - // // // // lptr += LB_DESCRIPTOR; - // // // // rel = xsup[ik]; /* Global row index of block ik. */ - // // // // for (i = 0; i < nbrow; ++i) { - // // // // irow = lsub[lptr++] - rel; /* Relative row. 
*/ - // // // // RHS_ITERATE(j) - // // // // dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; - // // // // } - // // // // luptr += nbrow; - - - -// // // // #if ( PROFlevel>=1 ) - // // // // TOC(t2, t1); - // // // // stat->utime[SOL_GEMM] += t2; - -// // // // #endif +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; +#endif +#if ( PROFlevel>=1 ) + TIC(t1); +#endif - - // // // // if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ - // // // // ikcol = PCOL( ik, grid ); - // // // // p = PNUM( myrow, ikcol, grid ); - // // // // if ( iam != p ) { -// // // // #ifdef ISEND_IRECV - // // // // MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm, - // // // // &send_req[Llu->SolveMsgSent++] ); -// // // // #else -// // // // #ifdef BSEND - // // // // MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // // // #else - // // // // MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // // // #endif -// // // // #endif -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - // // // // iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -// // // // #endif - // // // // } else { /* Diagonal process: X[i] += lsum[i]. */ - // // // // ii = X_BLK( lk ); - // // // // RHS_ITERATE(j) - // // // // for (i = 0; i < iknsupc; ++i) - // // // // x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; - // // // // if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ - // // // // fmod[lk] = -1; /* Do not solve X[k] in the future. */ - // // // // lk = LBj( ik, grid );/* Local block number, column-wise. */ - // // // // lsub1 = Llu->Lrowind_bc_ptr[lk]; - // // // // lusup1 = Llu->Lnzval_bc_ptr[lk]; - // // // // nsupr1 = lsub1[1]; - - -// // // // #if ( PROFlevel>=1 ) - // // // // TIC(t1); -// // // // #endif - -// // // // #ifdef _CRAY - // // // // STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, - // // // // lusup1, &nsupr1, &x[ii], &iknsupc); -// // // // #elif defined (USE_VENDOR_BLAS) - // // // // dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - // // // // lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); -// // // // #else - // // // // dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, - // // // // lusup1, &nsupr1, &x[ii], &iknsupc); -// // // // #endif - - -// // // // #if ( PROFlevel>=1 ) - // // // // TOC(t2, t1); - // // // // stat->utime[SOL_TRSM] += t2; - -// // // // #endif + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Llu->Lrowind_bc_ptr[lk]; + lusup = Llu->Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + for (lb = 0; lb < nlb; ++lb) { + ik = lsub[lptr]; /* Global block number, row-wise. */ + nbrow = lsub[lptr+1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow, 1, 1 ); +#else + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#endif + stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; + + lk = LBi( ik, grid ); /* Local block number, row-wise. 
*/ + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + lptr += LB_DESCRIPTOR; + rel = xsup[ik]; /* Global row index of block ik. */ + for (i = 0; i < nbrow; ++i) { + irow = lsub[lptr++] - rel; /* Relative row. */ + RHS_ITERATE(j) + dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; + } + luptr += nbrow; - // // // // stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Solve X[%2d]\n", iam, ik); -// // // // #endif - - // // // // /* - // // // // * Send Xk to process column Pc[k]. - // // // // */ - // // // // for (p = 0; p < grid->nprow; ++p) { - // // // // if ( fsendx_plist[lk][p] != EMPTY ) { - // // // // pi = PNUM( p, ikcol, grid ); -// // // // #ifdef ISEND_IRECV - // // // // MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm, - // // // // &send_req[Llu->SolveMsgSent++] ); -// // // // #else -// // // // #ifdef BSEND - // // // // MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // // // #else - // // // // MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // // // #endif -// // // // #endif -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Sent X[%2.0f] to P %2d\n", - // // // // iam, x[ii-XK_H], pi); -// // // // #endif - // // // // } - // // // // } - // // // // /* - // // // // * Perform local block modifications. - // // // // */ - // // // // nlb1 = lsub1[0] - 1; - // // // // lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - // // // // luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - // // // // dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - // // // // fmod, nlb1, lptr1, luptr1, xsup, - // // // // grid, Llu, send_req, stat); - // // // // } /* if frecv[lk] == 0 */ - // // // // } /* if iam == p */ - // // // // } /* if fmod[lk] == 0 */ - - // // // // } /* for lb ... */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_GEMM] += t2; + +#endif + + + + + if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( lk ); + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + dtrsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#endif + + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_TRSM] += t2; + +#endif + + + stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, ikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nlb1 = lsub1[0] - 1; + lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; + luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ + + dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, lptr1, luptr1, xsup, + grid, Llu, send_req, stat); + } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + + } /* for lb ... */ } /* dLSUM_FMOD */ @@ -268,410 +274,145 @@ void dlsum_bmod SuperLUStat_t *stat ) { -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= U_i,k * X[k]. - */ - // // // // double alpha = 1.0, beta = 0.0; - // // // // int iam, iknsupc, knsupc, myrow, nsupr, p, pi; - // // // // int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, - // // // // j, jj, lk, lk1, nub, ub, uptr; - // // // // int_t *usub; - // // // // double *uval, *dest, *y; - // // // // int_t *lsub; - // // // // double *lusup; - // // // // int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - // // // // int_t *brecv = Llu->brecv; - // // // // int_t **bsendx_plist = Llu->bsendx_plist; - // // // // MPI_Status status; - // // // // int test_flag; - - // // // // iam = grid->iam; - // // // // myrow = MYROW( iam, grid ); - // // // // knsupc = SuperSize( k ); - // // // // lk = LBj( k, grid ); /* Local block number, column-wise. */ - // // // // nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - // // // // for (ub = 0; ub < nub; ++ub) { - // // // // ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ - // // // // usub = Llu->Ufstnz_br_ptr[ik]; - // // // // uval = Llu->Unzval_br_ptr[ik]; - // // // // i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ - // // // // i += UB_DESCRIPTOR; - // // // // il = LSUM_BLK( ik ); - // // // // gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ - // // // // iknsupc = SuperSize( gik ); - // // // // ikfrow = FstBlockC( gik ); - // // // // iklrow = FstBlockC( gik+1 ); - - // // // // RHS_ITERATE(j) { - // // // // dest = &lsum[il + j*iknsupc]; - // // // // y = &xk[j*knsupc]; - // // // // uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. 
*/ - // // // // for (jj = 0; jj < knsupc; ++jj) { - // // // // fnz = usub[i + jj]; - // // // // if ( fnz < iklrow ) { /* Nonzero segment. */ - // // // // /* AXPY */ - // // // // for (irow = fnz; irow < iklrow; ++irow) - // // // // dest[irow - ikfrow] -= uval[uptr++] * y[jj]; - // // // // stat->ops[SOLVE] += 2 * (iklrow - fnz); - // // // // } - // // // // } /* for jj ... */ - // // // // } - - // // // // if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ - // // // // gikcol = PCOL( gik, grid ); - // // // // p = PNUM( myrow, gikcol, grid ); - // // // // if ( iam != p ) { -// // // // #ifdef ISEND_IRECV - // // // // MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm, - // // // // &send_req[Llu->SolveMsgSent++] ); -// // // // #else -// // // // #ifdef BSEND - // // // // MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // // // #else - // // // // MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // // // #endif -// // // // #endif -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - // // // // iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -// // // // #endif - // // // // } else { /* Diagonal process: X[i] += lsum[i]. */ - // // // // ii = X_BLK( ik ); - // // // // dest = &x[ii]; - // // // // RHS_ITERATE(j) - // // // // for (i = 0; i < iknsupc; ++i) - // // // // dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; - // // // // if ( !brecv[ik] ) { /* Becomes a leaf node. */ - // // // // bmod[ik] = -1; /* Do not solve X[k] in the future. */ - // // // // lk1 = LBj( gik, grid ); /* Local block number. */ - // // // // lsub = Llu->Lrowind_bc_ptr[lk1]; - // // // // lusup = Llu->Lnzval_bc_ptr[lk1]; - // // // // nsupr = lsub[1]; -// // // // #ifdef _CRAY - // // // // STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, - // // // // lusup, &nsupr, &x[ii], &iknsupc); -// // // // #elif defined (USE_VENDOR_BLAS) - // // // // dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - // // // // lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); -// // // // #else - // // // // dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, - // // // // lusup, &nsupr, &x[ii], &iknsupc); -// // // // #endif - // // // // stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Solve X[%2d]\n", iam, gik); -// // // // #endif - - // // // // /* - // // // // * Send Xk to process column Pc[k]. - // // // // */ - // // // // for (p = 0; p < grid->nprow; ++p) { - // // // // if ( bsendx_plist[lk1][p] != EMPTY ) { - // // // // pi = PNUM( p, gikcol, grid ); -// // // // #ifdef ISEND_IRECV - // // // // MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm, - // // // // &send_req[Llu->SolveMsgSent++] ); -// // // // #else -// // // // #ifdef BSEND - // // // // MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // // // #else - // // // // MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // // // #endif -// // // // #endif -// // // // #if ( DEBUGlevel>=2 ) - // // // // printf("(%2d) Sent X[%2.0f] to P %2d\n", - // // // // iam, x[ii-XK_H], pi); -// // // // #endif - // // // // } - // // // // } - // // // // /* - // // // // * Perform local block modifications. 
- // // // // */ - // // // // if ( Urbs[lk1] ) - // // // // dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, - // // // // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - // // // // send_req, stat); - // // // // } /* if brecv[ik] == 0 */ - // // // // } - // // // // } /* if bmod[ik] == 0 */ - - // // // // } /* for ub ... */ - -} /* dlSUM_BMOD */ - - - - -// /************************************************************************/ -// /*! \brief - // * - // *
- // * Purpose
- // * =======
- // *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
- // * 
- // */ -// void dlsum_fmod_inv -// /************************************************************************/ -// ( - // double *lsum, /* Sum of local modifications. */ - // double *x, /* X array (local) */ - // double *xk, /* X[k]. */ - // double *rtemp, /* Result of full matrix-vector multiply. */ - // int nrhs, /* Number of right-hand sides. */ - // int knsupc, /* Size of supernode k. */ - // int_t k, /* The k-th component of X. */ - // int_t *fmod, /* Modification count for L-solve. */ - // int_t nlb, /* Number of L blocks. */ - // int_t lptr, /* Starting position in lsub[*]. */ - // int_t luptr, /* Starting position in lusup[*]. */ - // int_t *xsup, - // gridinfo_t *grid, - // LocalLU_t *Llu, - // MPI_Request send_req[], /* input/output */ - // SuperLUStat_t *stat -// ) -// { - // double alpha = 1.0, beta = 0.0; - // double *lusup, *lusup1; - // double *dest; - // double *Linv;/* Inverse of diagonal block */ - // int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; - // int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel, lib; - // int_t *lsub, *lsub1, nlb1, lptr1, luptr1; - // int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - // int_t *frecv = Llu->frecv; - // int_t **fsendx_plist = Llu->fsendx_plist; - // MPI_Status status; - // int test_flag; - // yes_no_t done; - // BcTree *LBtree_ptr = Llu->LBtree_ptr; - // RdTree *LRtree_ptr = Llu->LRtree_ptr; - - -// #if ( PROFlevel>=1 ) - // double t1, t2; - // float msg_vol = 0, msg_cnt = 0; -// #endif - - - // iam = grid->iam; - // myrow = MYROW( iam, grid ); - // lk = LBj( k, grid ); /* Local block number, column-wise. */ - // lsub = Llu->Lrowind_bc_ptr[lk]; - // lusup = Llu->Lnzval_bc_ptr[lk]; - // nsupr = lsub[1]; - - // // printf("nlb: %5d\n",nlb); - // // fflush(stdout); - - - - - - // for (lb = 0; lb < nlb; ++lb) { - -// #if ( PROFlevel>=1 ) - // TIC(t1); -// #endif - - // ik = lsub[lptr]; /* Global block number, row-wise. */ - // nbrow = lsub[lptr+1]; -// #ifdef _CRAY - // SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, - // &alpha, &lusup[luptr], &nsupr, xk, - // &knsupc, &beta, rtemp, &nbrow ); -// #elif defined (USE_VENDOR_BLAS) - // dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - // &alpha, &lusup[luptr], &nsupr, xk, - // &knsupc, &beta, rtemp, &nbrow, 1, 1 ); -// #else - // dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, - // &alpha, &lusup[luptr], &nsupr, xk, - // &knsupc, &beta, rtemp, &nbrow ); -// #endif - // stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; - - // lk = LBi( ik, grid ); /* Local block number, row-wise. */ - // iknsupc = SuperSize( ik ); - // il = LSUM_BLK( lk ); - // dest = &lsum[il]; - // lptr += LB_DESCRIPTOR; - // rel = xsup[ik]; /* Global row index of block ik. */ - // RHS_ITERATE(j) - // for (i = 0; i < nbrow; ++i) { - // irow = lsub[lptr++] - rel; /* Relative row. */ - // dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; - // } - // luptr += nbrow; - - - -// #if ( PROFlevel>=1 ) - // TOC(t2, t1); - // stat->utime[SOL_GEMM] += t2; - -// #endif - - // if ( (--fmod[lk])==0 ) { /* Local accumulation done. 
*/ - // ikcol = PCOL( ik, grid ); - // p = PNUM( myrow, ikcol, grid ); - // if ( iam != p ) { - // if(frecv[lk]==0){ - // fmod[lk] = -1; - // RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H]); - // } - -// // #ifdef ISEND_IRECV - // // MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // MPI_DOUBLE, p, LSUM, grid->comm, - // // &send_req[Llu->SolveMsgSent++] ); -// // #else -// // #ifdef BSEND - // // MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // #else - // // MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - // // MPI_DOUBLE, p, LSUM, grid->comm ); -// // #endif -// // #endif -// // #if ( DEBUGlevel>=2 ) - // // printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - // // iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); -// // #endif - - - - - // } else { /* Diagonal process: X[i] += lsum[i]. */ - // ii = X_BLK( lk ); - // RHS_ITERATE(j) - // for (i = 0; i < iknsupc; ++i) - // x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; - // if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ - // fmod[lk] = -1; /* Do not solve X[k] in the future. */ - // lk = LBj( ik, grid );/* Local block number, column-wise. */ - // lsub1 = Llu->Lrowind_bc_ptr[lk]; - // lusup1 = Llu->Lnzval_bc_ptr[lk]; - // nsupr1 = lsub1[1]; - - -// #if ( PROFlevel>=1 ) - // TIC(t1); -// #endif - - // if(Llu->inv == 1){ - // Linv = Llu->Linv_bc_ptr[lk]; -// #ifdef _CRAY - // SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - // &alpha, Linv, &iknsupc, &x[ii], - // &iknsupc, &beta, rtemp, &iknsupc ); -// #elif defined (USE_VENDOR_BLAS) - // dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - // &alpha, Linv, &iknsupc, &x[ii], - // &iknsupc, &beta, rtemp, &iknsupc, 1, 1 ); -// #else - // dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - // &alpha, Linv, &iknsupc, &x[ii], - // &iknsupc, &beta, rtemp, &iknsupc ); -// #endif - // for (i=0 ; i=1 ) - // TOC(t2, t1); - // stat->utime[SOL_TRSM] += t2; - -// #endif - - - // stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; -// #if ( DEBUGlevel>=2 ) - // printf("(%2d) Solve X[%2d]\n", iam, ik); -// #endif - - // /* - // * Send Xk to process column Pc[k]. - // */ - - // if(LBtree_ptr[lk]!=NULL) - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); - - - // // for (p = 0; p < grid->nprow; ++p) { - // // if ( fsendx_plist[lk][p] != EMPTY ) { - // // pi = PNUM( p, ikcol, grid ); -// // #ifdef ISEND_IRECV - // // MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // MPI_DOUBLE, pi, Xk, grid->comm, - // // &send_req[Llu->SolveMsgSent++] ); -// // #else -// // #ifdef BSEND - // // MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // #else - // // MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - // // MPI_DOUBLE, pi, Xk, grid->comm ); -// // #endif -// // #endif -// // #if ( DEBUGlevel>=2 ) - // // printf("(%2d) Sent X[%2.0f] to P %2d\n", - // // iam, x[ii-XK_H], pi); -// // #endif - // // } - // // } - - - - - - // /* - // * Perform local block modifications. - // */ - // nlb1 = lsub1[0] - 1; - // lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - // luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - // dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - // fmod, nlb1, lptr1, luptr1, xsup, - // grid, Llu, send_req, stat); - // } /* if frecv[lk] == 0 */ - // } /* if iam == p */ - // } /* if fmod[lk] == 0 */ - - // } /* for lb ... 
*/ - -// } /* dLSUM_FMOD_inv */ - - - - + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + double alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + double *uval, *dest, *y; + int_t *lsub; + double *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + MPI_Status status; + int test_flag; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_DOUBLE, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( ik ); + dest = &x[ii]; + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( !brecv[ik] ) { /* Becomes a leaf node. */ + bmod[ik] = -1; /* Do not solve X[k] in the future. */ + lk1 = LBj( gik, grid ); /* Local block number. */ + lsub = Llu->Lrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + dtrsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#endif + stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < grid->nprow; ++p) { + if ( bsendx_plist[lk1][p] != EMPTY ) { + pi = PNUM( p, gikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_DOUBLE, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + if ( Urbs[lk1] ) + dlsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + } /* for ub ... */ +} /* dlSUM_BMOD */ /************************************************************************/ @@ -695,284 +436,950 @@ void dlsum_fmod_inv int_t k, /* The k-th component of X. */ int_t *fmod, /* Modification count for L-solve. */ int_t nlb, /* Number of L blocks. */ - int_t lptr, /* Starting position in lsub[*]. */ - int_t luptr, /* Starting position in lusup[*]. */ int_t *xsup, gridinfo_t *grid, LocalLU_t *Llu, - MPI_Request send_req[], /* input/output */ - SuperLUStat_t *stat -) + SuperLUStat_t **stat, + int_t *leaf_send, + int_t *nleaf_send, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel + ) { - double alpha = 1.0, beta = 0.0,malpha=-1.0; - double *lusup, *lusup1; - double *dest; - double *Linv;/* Inverse of diagonal block */ - int iam, iknsupc, myrow, krow, nbrow, nsupr, nsupr1, p, pi, idx_r; - int_t i, ii, ik, il, ikcol, irow, j, lb, lk, rel, lib; - int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *frecv = Llu->frecv; - int_t **fsendx_plist = Llu->fsendx_plist; - int_t luptr_tmp,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, m, idx_l; - int thread_id; + double alpha = 1.0, beta = 0.0,malpha=-1.0; + double *lusup, *lusup1; + double *dest; + double *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id,thread_id1,num_thread; flops_t ops_loc=0.0; - MPI_Status status; - int test_flag; + MPI_Status status; + int test_flag; yes_no_t done; BcTree *LBtree_ptr = Llu->LBtree_ptr; RdTree *LRtree_ptr = Llu->LRtree_ptr; int_t* idx_lsum,idx_lsum1; - -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; -#endif + double *rtemp_loc; + int_t ldalsum,maxsuper,aln_d; + int dword = sizeof (double); + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. 
*/ + + maxsuper = sp_ienv_dist(3); + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + ldalsum=Llu->ldalsum; + rtemp_loc = &rtemp[sizertemp* thread_id]; -if(nlb>0){ + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif -#if ( PROFlevel>=1 ) - TIC(t1); -#endif + if(nlb>0){ + maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); - - iam = grid->iam; - myrow = MYROW( iam, grid ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - lsub = Llu->Lrowind_bc_ptr[lk]; - lusup = Llu->Lnzval_bc_ptr[lk]; - lloc = Llu->Lindval_loc_bc_ptr[lk]; - idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; - - nsupr = lsub[1]; - - // printf("nlb: %5d lk: %5d\n",nlb,lk); - // fflush(stdout); + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + lsub = Llu->Lrowind_bc_ptr[lk]; - krow = PROW( k, grid ); - if(myrow==krow){ - idx_n = 1; - idx_i = nlb+2; - idx_v = 2*nlb+3; - luptr_tmp = lloc[idx_v]; - m = nsupr-knsupc; - }else{ - idx_n = 0; - idx_i = nlb; - idx_v = 2*nlb; - luptr_tmp = lloc[idx_v]; - m = nsupr; - } - - // printf("m %5d k %5d \n",m,k); + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); // fflush(stdout); + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>8*maxsuper){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp], &nsupr, xk, - &knsupc, &beta, rtemp, &m ); + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &m, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp], &nsupr, xk, - &knsupc, &beta, rtemp, &m, 1, 1 ); + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); #else - dgemm_( "N", "N", &m, &nrhs, &knsupc, - &alpha, &lusup[luptr_tmp], &nsupr, xk, - &knsupc, &beta, rtemp, &m ); -#endif + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#endif + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. 
*/ + rel = xsup[ik]; /* Global row index of block ik. */ - stat->ops[SOLVE] += 2 * m * nrhs * knsupc; - - - for (i = 0; i < m*nrhs; ++i) { - lsum[idx_lsum[i]] -=rtemp[i]; - } + lk = LBi( ik, grid ); /* Local block number, row-wise. */ - - - - -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[SOL_GEMM] += t2; - -#endif - - - - + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); - - // idx_r=0; - // for (lb = 0; lb < nlb; ++lb) { - - // // printf("ind: %5d val: %5d\n",(lb+1)+(nlb+1), (lb+1)+2*(nlb+1)); - // // fflush(stdout); + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - // lptr1_tmp = lloc[lb+idx_i]; + lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } - // ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ - // nbrow = lsub[lptr1_tmp+1]; - +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + for (lb=lbstart;lb=1 ) - // TOC(t2, t1); - // stat->utime[SOL_GEMM] += t2; - -// #endif - - - for (lb = 0; lb < nlb; ++lb) { - lk = lloc[lb+idx_n]; - if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ - - lptr1_tmp = lloc[lb+idx_i]; - // luptr_tmp = lloc[lb+idx_v]; - - ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ - lk = LBi( ik, grid ); /* Local block number, row-wise. */ - - iknsupc = SuperSize( ik ); - il = LSUM_BLK( lk ); + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + lk = LBi( ik, grid ); /* Local block number, row-wise. */ - nbrow = lsub[lptr1_tmp+1]; - - ikcol = PCOL( ik, grid ); - p = PNUM( myrow, ikcol, grid ); - if ( iam != p ) { - if(frecv[lk]==0){ - fmod[lk] = -1; - RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H]); - } + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { - } else { /* Diagonal process: X[i] += lsum[i]. */ + for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; - - #if ( PROFlevel>=1 ) - TIC(t1); -#endif - - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat->utime[SOL_TRSM] += t2; - -#endif + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; - stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - - /* - * Send Xk to process column Pc[k]. 
- */ - - if(LBtree_ptr[lk]!=NULL) - BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); - - /* - * Perform local block modifications. - */ - nlb1 = lsub1[0] - 1; - lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; - luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ - - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, - fmod, nlb1, lptr1, luptr1, xsup, - grid, Llu, send_req, stat); + /* + * Send Xk to process column Pc[k]. + */ - - - } /* if frecv[lk] == 0 */ - } /* if iam == p */ - } /* if fmod[lk] == 0 */ + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[nleaf_send_tmp-1] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + } + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + + nlb1 = lsub1[0] - 1; + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } - } /* for lb ... */ - - } /* if nlb>0*/ -} /* dLSUM_FMOD_inv */ + } + } + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + dgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + // for (i = 0; i < m*nrhs; ++i) { + // lsum[idx_lsum[i]+sizelsum*thread_id] -=rtemp_loc[i]; + // } + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. 
*/ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + + + + // TOC(t3, t1); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; + +#endif + + thread_id1 = omp_get_thread_num (); + rtemp_loc = &rtemp[sizertemp* thread_id1]; + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[nleaf_send_tmp-1] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + } + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + + { + nlb1 = lsub1[0] - 1; + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + +} + + +stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + +} /* if nlb>0*/ +} /* dLSUM_FMOD_inv */ + + + + + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
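+ *
+ *   When the update is large enough, the routine below splits the nlb
+ *   off-diagonal row blocks of column k into one chunk per OpenMP thread
+ *   and handles each chunk in a taskloop iteration (a GEMM on the chunk,
+ *   then a scatter into lsum).  A minimal standalone sketch of that
+ *   partitioning, assuming the boundary formula shown here is equivalent
+ *   to the one used in the routine body (Nchunk, nlb_loc, remainder,
+ *   lbstart, lbend are the names used there):
+ *
+ *     int Nchunk    = num_thread;       /* one chunk per thread            */
+ *     int nlb_loc   = nlb / Nchunk;     /* base number of blocks per chunk */
+ *     int remainder = nlb % Nchunk;     /* first chunks take one extra     */
+ *     for (int nn = 0; nn < Nchunk; ++nn) {
+ *         int lbstart = nn * nlb_loc + (nn < remainder ? nn : remainder);
+ *         int lbend   = lbstart + nlb_loc + (nn < remainder ? 1 : 0);
+ *         /* process row blocks [lbstart, lbend) in one task */
+ *     }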
+ */ +void dlsum_fmod_inv_master +/************************************************************************/ +( + double *lsum, /* Sum of local modifications. */ + double *x, /* X array (local) */ + double *xk, /* X[k]. */ + double *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel + ) +{ + double alpha = 1.0, beta = 0.0,malpha=-1.0; + double *lusup, *lusup1; + double *dest; + double *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id,thread_id1,num_thread; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + double *rtemp_loc; + int_t ldalsum,maxsuper,aln_d; + int dword = sizeof (double); + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + + maxsuper = sp_ienv_dist(3); + + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + + if(nlb>0){ + + maxrecvsz = sp_ienv_dist(3) * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + + lsub = Llu->Lrowind_bc_ptr[lk]; + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>4*maxsuper || nrhs>10){ + // if(m<1){ + + + + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + + + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); +#else + dgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); +#endif + + + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + } + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + dgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + // for (i = 0; i < m*nrhs; ++i) { + // lsum[idx_lsum[i]] -=rtemp_loc[i]; + // } + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. 
*/ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; + +#endif + + } + + // TOC(t3, t1); + + + + thread_id1 = omp_get_thread_num (); + + + + + rtemp_loc = &rtemp[sizertemp* thread_id1]; + + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + + + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL) + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H]); + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,send_req,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + nlb1 = lsub1[0] - 1; + + + dlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat,sizelsum,sizertemp,1+recurlevel); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + + + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + + } /* if nlb>0*/ +} /* dlsum_fmod_inv_master */ @@ -989,175 +1396,749 @@ void dlsum_bmod_inv int_t k, /* The k-th component of X. */ int_t *bmod, /* Modification count for L-solve. */ int_t *Urbs, /* Number of row blocks in each block column of U.*/ + int_t *Urbs2, Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ int_t *xsup, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request send_req[], /* input/output */ - SuperLUStat_t *stat + SuperLUStat_t **stat, + int_t* root_send, + int_t* nroot_send, + int_t sizelsum, + int_t sizertemp ) { -/* - * Purpose - * ======= - * Perform local block modifications: lsum[i] -= U_i,k * X[k]. - */ - double alpha = 1.0, beta = 0.0; - int iam, iknsupc, knsupc, myrow, nsupr, p, pi; - int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, - j, jj, lk, lk1, nub, ub, uptr; - int_t *usub; - double *uval, *dest, *y; - int_t *lsub; - double *lusup; - int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ - int_t *brecv = Llu->brecv; - int_t **bsendx_plist = Llu->bsendx_plist; - MPI_Status status; - int test_flag; - - double *Uinv;/* Inverse of diagonal block */ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. 
+ */ + double alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + double *uval, *dest, *y; + int_t *lsub; + double *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id,thread_id1,num_thread; + double *rtemp_loc; + int_t nroot_send_tmp; + double *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); +#else + thread_id = 0; + num_thread = 1; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + - iam = grid->iam; - myrow = MYROW( iam, grid ); - knsupc = SuperSize( k ); - lk = LBj( k, grid ); /* Local block number, column-wise. */ - nub = Urbs[lk]; /* Number of U blocks in block column lk */ - - for (ub = 0; ub < nub; ++ub) { - ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ - usub = Llu->Ufstnz_br_ptr[ik]; - uval = Llu->Unzval_br_ptr[ik]; - i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ - i += UB_DESCRIPTOR; - il = LSUM_BLK( ik ); - gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ - iknsupc = SuperSize( gik ); - ikfrow = FstBlockC( gik ); - iklrow = FstBlockC( gik+1 ); - - RHS_ITERATE(j) { - dest = &lsum[il + j*iknsupc]; - y = &xk[j*knsupc]; - uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ - for (jj = 0; jj < knsupc; ++jj) { - fnz = usub[i + jj]; - if ( fnz < iklrow ) { /* Nonzero segment. */ - /* AXPY */ - for (irow = fnz; irow < iklrow; ++irow) - dest[irow - ikfrow] -= uval[uptr++] * y[jj]; - stat->ops[SOLVE] += 2 * (iklrow - fnz); + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // // // // if(Urbs2[lk]>num_thread){ + // if(Urbs2[lk]>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#endif + for (nn=0;nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. 
*/ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; iUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; inum_thread){ + #ifdef _OPENMP + #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,Urbs2,lsum,stat,nrhs,grid,xsup) untied + #endif + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat, root_send, nroot_send, sizelsum,sizertemp); + }else{ + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs,Urbs2, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat, root_send, nroot_send, sizelsum,sizertemp); + } + + // } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ } - if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ - gikcol = PCOL( gik, grid ); - p = PNUM( myrow, gikcol, grid ); - if ( iam != p ) { -#ifdef ISEND_IRECV - MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm, - &send_req[Llu->SolveMsgSent++] ); +} /* dlSUM_BMOD_inv */ + + + + + + + +/************************************************************************/ +void dlsum_bmod_inv_master +/************************************************************************/ +( + double *lsum, /* Sum of local modifications. */ + double *x, /* X array (local). */ + double *xk, /* X[k]. */ + double *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + int_t *Urbs2, + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + LocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + double alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + double *uval, *dest, *y; + int_t *lsub; + double *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. 
*/ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id,thread_id1,num_thread; + double *rtemp_loc; + + double *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + +#ifdef _OPENMP + thread_id = omp_get_thread_num (); + num_thread = omp_get_num_threads (); #else -#ifdef BSEND - MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, - MPI_DOUBLE, p, LSUM, grid->comm ); + thread_id = 0; + num_thread = 1; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // if(nub>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop firstprivate (send_req,stat) private (thread_id1,nn,lbstart,lbend,ub,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +#endif + for (nn=0;nncomm ); -#endif -#endif + thread_id1 = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id1]; + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + if(nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + }else{ +#ifdef _OPENMP + thread_id1 = omp_get_thread_num (); +#else + thread_id1 = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id1]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... 
*/ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + + +#ifdef _OPENMP + thread_id1 = omp_get_thread_num (); +#else + thread_id1 = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id1]; + for (ub = 0; ub < nub; ++ub){ + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[ik]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) - printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", - iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); #endif - } else { /* Diagonal process: X[i] += lsum[i]. */ - ii = X_BLK( ik ); - dest = &x[ii]; - RHS_ITERATE(j) - for (i = 0; i < iknsupc; ++i) - dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; - if ( !brecv[ik] ) { /* Becomes a leaf node. */ - bmod[ik] = -1; /* Do not solve X[k] in the future. */ - lk1 = LBj( gik, grid ); /* Local block number. */ - lsub = Llu->Lrowind_bc_ptr[lk1]; - lusup = Llu->Lnzval_bc_ptr[lk1]; - nsupr = lsub[1]; - - if(Llu->inv == 1){ - Uinv = Llu->Uinv_bc_ptr[lk1]; + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Uinv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Uinv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Uinv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp, &iknsupc ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - for (i=0 ; iops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; +#endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, gik); + printf("(%2d) Solve X[%2d]\n", iam, gik); #endif - /* - * Send Xk to process column Pc[k]. - */ - for (p = 0; p < grid->nprow; ++p) { - if ( bsendx_plist[lk1][p] != EMPTY ) { - pi = PNUM( p, gikcol, grid ); -#ifdef ISEND_IRECV - MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm, - &send_req[Llu->SolveMsgSent++] ); -#else -#ifdef BSEND - MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#else - MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, - MPI_DOUBLE, pi, Xk, grid->comm ); -#endif -#endif -#if ( DEBUGlevel>=2 ) - printf("(%2d) Sent X[%2.0f] to P %2d\n", - iam, x[ii-XK_H], pi); -#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + + // for (i=0 ; i @@ -1192,7 +1196,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t Glu_freeable_n; LocalLU_t *Llu = LUstruct->Llu; - int_t bnnz, fsupc, i, irow, istart, j, jb,ib, jj, k, + int_t bnnz, fsupc, i, irow, istart, j, jb,ib, jj, k, k1, len, len1, nsupc, nsupc_gb, ii, nprocs; int_t lib; /* local block row number */ int_t nlb; /* local block rows*/ @@ -1201,7 +1205,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t nrbu; /* number of U blocks in current block column */ int_t gb; /* global block number; 0 < gb <= nsuper */ int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ - int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; + int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p; int_t mybufmax[NBUFFERS]; NRformat_loc *Astore; double *a; @@ -1209,7 +1213,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; double *asup_val, *ainf_val; int_t *xsup, *supno; /* supernode and column mapping */ - int_t *lsub, *xlsub, *usub, *xusub; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; int_t next_ind; /* next available position in index[*] */ int_t next_val; /* next available position in nzval[*] */ @@ -1222,9 +1226,22 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + double *lusup_srt; /* nonzero values in L and U */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + /*-- Counts to be used in factorization. 
--*/ int *ToRecv, *ToSendD, **ToSendR; @@ -1251,10 +1268,8 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, int_t *LUb_number; /* global block number; size nsupers_ij */ int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ - double *dense, *dense_col; /* SPA */ - double zero = 0.0; - int_t ldaspa; /* LDA of SPA */ - int_t iword, dword; + + float memStrLU, memA, memDist = 0.; /* memory used for redistributing the data, which does not include the memory for the numerical values @@ -1262,23 +1277,35 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, float memNLU = 0.; /* memory allocated for storing the numerical values of L and U, that will be used in the numeric factorization (positive number) */ - - BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ - RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ - int_t msgsize; - int_t *ActiveFlag; - int Iactive; - int_t *ranks; - int_t *idxs; - int_t **nzrows; - double rseed; - int_t rank_cnt,rank_cnt_ref,Root; - int_t *mod_bit; - int_t *frecv; - + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + double *dense, *dense_col; /* SPA */ + double zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu, aln_i; + int_t nub; + #if ( PRNTlevel>=1 ) int_t nLblocks = 0, nUblocks = 0; #endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif /* Initialization. */ iam = grid->iam; @@ -1294,6 +1321,8 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, iword = sizeof(int_t); dword = sizeof(double); + aln_i = ceil(CACHELINE/(double)iword); + if (fact == SamePattern_SameRowPerm) { ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); } @@ -1473,11 +1502,18 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); return (memDist + memNLU); } - memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*); + + if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ + fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); + return (memDist + memNLU); + } + + memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*); Lnzval_bc_ptr[nsupers_j-1] = NULL; Linv_bc_ptr[nsupers_j-1] = NULL; Uinv_bc_ptr[nsupers_j-1] = NULL; Lrowind_bc_ptr[nsupers_j-1] = NULL; + Lindval_loc_bc_ptr[nsupers_j-1] = NULL; /* These lists of processes will be used for triangular solves. 
*/ if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { @@ -1776,8 +1812,13 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, fprintf(stderr, "Malloc fails for Uinv_bc_ptr[*][] col block " IFMT, jb); return (memDist + memNLU); } - + memNLU += len1*iword + len*nsupc*dword; + + if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(((nrbl*3 + (aln_i - 1)) / aln_i) * aln_i)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); + + lusup = Lnzval_bc_ptr[ljb_j]; mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); @@ -1791,6 +1832,11 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, gb = LUb_number[k]; lb = LBi( gb, grid ); len = LUb_length[lb]; + + Lindval_loc_bc_ptr[ljb_j][k] = lb; + Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; + Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; + LUb_length[lb] = 0; index[next_ind++] = gb; /* Descriptor */ index[next_ind++] = len; @@ -1819,11 +1865,65 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, } } } /* for i ... */ + + + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb_j][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb_j]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = doubleMalloc_dist(len*nsupc))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jjnpcol );/* Number of local block columns */ - // // if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) - // // ABORT("Malloc fails for LBtree_ptr[]."); - // // if ( !(ActiveFlag = intCalloc_dist(grid->nprow)) ) - // // ABORT("Calloc fails for ActiveFlag[]."); - // // if ( !(ranks = intCalloc_dist(grid->nprow)) ) - // // ABORT("Calloc fails for ranks[]."); - - - // // for (ljb = 0; ljb nprow;++j)ActiveFlag[j]=0; - // // for (j=0;jnprow;++j)ranks[j]=-1; - // // Root=-1; - // // Iactive = 0; - // // fsupc = FstBlockC( jb ); - // // nsupc = SuperSize( jb ); - // // ljb = LBj( jb, grid ); /* Local block number */ - // // LBtree_ptr[ljb]=NULL; - - // // // printf("Iamhere1111 %5d\n",jb); - // // // fflush(stdout); - // // istart = xlsub[fsupc]; - // // for (i = istart; i < xlsub[fsupc+1]; ++i) { - // // irow = lsub[i]; - // // gb = BlockNum( irow ); - // // pr = PROW( gb, grid ); - // // ActiveFlag[pr]=1; - // // if(gb==jb)Root=pr; - // // if(myrow==pr)Iactive=1; - - // // } /* for j ... 
*/ - // // if(Iactive==1){ - - - // // assert( Root>-1 ); - // // rank_cnt = 1; - // // ranks[0]=Root; - // // for (j = 0; j < grid->nprow; ++j){ - // // if(ActiveFlag[j]==1 && j!=Root){ - // // ranks[rank_cnt]=j; - // // ++rank_cnt; - // // } - // // } - - // // if(rank_cnt>1){ - - // // // rseed=rand(); - // // rseed=1.0; - // // msgsize = SuperSize( jb )*nrhs+XK_H; - // // LBtree_ptr[ljb] = BcTree_Create(grid->cscp.comm, ranks, rank_cnt, msgsize,rseed); - // // BcTree_SetTag(LBtree_ptr[ljb],jb); - - // // // TreeTest(LBtree_ptr[ljb]); - -// // // #if ( PRNTlevel>=1 ) - // // if(Root==myrow){ - // // rank_cnt_ref=1; - // // for (j = 0; j < grid->nprow; ++j) { - // // if ( fsendx_plist[ljb][j] != EMPTY ) { - // // ++rank_cnt_ref; - // // } - // // } - // // assert(rank_cnt==rank_cnt_ref); - - // // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); - // // fflush(stdout); - // // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); - // // // for(j=0;jnprow );/* Number of local block rows */ - // // if ( !(mod_bit = intMalloc_dist(nlb)) ) - // // ABORT("Malloc fails for mod_bit[]."); - // // if ( !(frecv = intMalloc_dist(nlb)) ) - // // ABORT("Malloc fails for frecv[]."); - - // // for (k = 0; k < nlb; ++k) mod_bit[k] = 0; - // // for (k = 0; k < nsupers; ++k) { - // // pr = PROW( k, grid ); - // // if ( myrow == pr ) { - // // lib = LBi( k, grid ); /* local block number */ - // // kcol = PCOL( k, grid ); - // // if (mycol == kcol || fmod[lib] ) - // // mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - // // } - // // } - // // /* Every process receives the count, but it is only useful on the - // // diagonal processes. */ - // // MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); - - - - // // k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ - // // if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) - // // ABORT("Malloc fails for LRtree_ptr[]."); - // // if ( !(ActiveFlag = intCalloc_dist(grid->npcol)) ) - // // ABORT("Calloc fails for ActiveFlag[]."); - // // if ( !(ranks = intCalloc_dist(grid->npcol)) ) - // // ABORT("Calloc fails for ranks[]."); - - // // if ( !(idxs = intCalloc_dist(nsupers)) ) - // // ABORT("Calloc fails for idxs[]."); - - // // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) - // // ABORT("Malloc fails for nzrows[]."); - - // // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ - // // fsupc = FstBlockC( jb ); - // // len=xlsub[fsupc+1]-xlsub[fsupc]; - // // idxs[jb] = len-1; - // // if(len>0){ - // // if ( !(nzrows[jb] = intMalloc_dist(len)) ) - // // ABORT("Malloc fails for nzrows[jb]"); - // // for(i=xlsub[fsupc];i0 ; --ib) { - // // pr = PROW( ib, grid ); - // // if ( myrow == pr ) { /* Block row ib in my process row */ - - // // for (j=0;jnpcol;++j)ActiveFlag[j]=0; - // // for (j=0;jnpcol;++j)ranks[j]=-1; - // // Root=-1; - // // Iactive = 0; - // // lib = LBi( ib, grid ); /* Local block number */ - // // LRtree_ptr[lib]=NULL; - - // // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ - // // fsupc = FstBlockC( jb ); - // // if(idxs[jb]>=0){ /* if column jb has not been iterated through */ - // // irow = nzrows[jb][idxs[jb]]; - // // gb = BlockNum( irow ); - - // // while(gb>=ib){ - // // if(gb==ib){ /* (ib,jb) nonempty*/ - // // pc = PCOL( jb, grid ); - // // ActiveFlag[pc]=1; - // // if(ib==jb)Root=pc; - // // if(mycol==pc)Iactive=1; - // // } - // // if(idxs[jb]-1>=0){ - // // --idxs[jb]; - // // irow = nzrows[jb][idxs[jb]]; - // // gb = BlockNum( irow );} - // // else{break;} - // // } - // // } - // // } - - - // // if(Iactive==1){ - // // assert( Root>-1 ); - // // rank_cnt = 1; - // // ranks[0]=Root; - // // for (j = 0; j < grid->npcol; ++j){ - // // if(ActiveFlag[j]==1 && j!=Root){ - // // ranks[rank_cnt]=j; - // // ++rank_cnt; - // // } - // // } - // // if(rank_cnt>1){ - // // // rseed=rand(); - // // rseed=1.0; - // // msgsize = SuperSize( ib )*nrhs+LSUM_H; - // // // LRtree_ptr[lib] = RdTree_Create(grid->rscp.comm, ranks, rank_cnt, msgsize,rseed); - // // // RdTree_SetTag(LRtree_ptr[lib], ib+nsupers); - - - -// // // #if ( PRNTlevel>=1 ) - // // if(Root==mycol){ - // // assert(rank_cnt==frecv[lib]); - // // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); - // // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); - // // // // for(j=0;jnpcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + + + + ///////////////////////////////////////////////////////////////// + + if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... 
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb]); + BcTree_SetTag(LBtree_ptr[ljb],BC_L); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb])); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib]); + RdTree_SetTag(LRtree_ptr[lib], RD_L); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + + #if ( PRNTlevel>=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... 
*/ + ib = myrow+lib*grid->nprow; /* not sure */ + + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); + + if(ibnprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb]); + BcTree_SetTag(UBtree_ptr[ljb],BC_U); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... */ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib]); + RdTree_SetTag(URtree_ptr[lib], RD_U); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + + /* Free the memory used for storing L and U */ + SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); + if (lsub != NULL) + SUPERLU_FREE(lsub); + if (usub != NULL) + SUPERLU_FREE(usub); + SUPERLU_FREE(nnzToRecv); SUPERLU_FREE(ptrToRecv); SUPERLU_FREE(nnzToSend); @@ -2219,6 +2761,7 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, SUPERLU_FREE(recvBuf); Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; Llu->Linv_bc_ptr = Linv_bc_ptr; Llu->Uinv_bc_ptr = Uinv_bc_ptr; @@ -2240,6 +2783,11 @@ ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, LUstruct->Glu_persist = Glu_persist; Llu->LRtree_ptr = LRtree_ptr; Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", diff --git a/SRC/pdutil.c b/SRC/pdutil.c index 05975b0d..a27f2231 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -533,6 +533,17 @@ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + fflush(stdout); + + // while(1); + + // if(err>1e-5){ + // if( !iam ) printf("Wrong solution! \n"); + // fflush(stdout); + // while(1); + + // ABORT("Wrong solution! \n"); +// } } } diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index f4d18c91..7e6ccf23 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -100,7 +100,6 @@ typedef struct { int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */ int_t SolveMsgVol; /* Volume of messages sent in the solve phase */ - /*********************/ /* The following variables are used in the hybrid solver */ @@ -298,13 +297,21 @@ extern void dlsum_bmod(double *, double *, double *, int_t **, int_t *, gridinfo_t *, LocalLU_t *, MPI_Request [], SuperLUStat_t *); extern void dlsum_fmod_inv(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, int_t, int_t, + int, int, int_t , int_t *, int_t, int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); + SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t); +extern void dlsum_fmod_inv_master(double *, double *, double *, double *, + int, int, int_t , int_t *, int_t, + int_t *, gridinfo_t *, LocalLU_t *, + SuperLUStat_t **, int_t, int_t, int_t); extern void dlsum_bmod_inv(double *, double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **, int_t **, int_t *, gridinfo_t *, LocalLU_t *, - MPI_Request [], SuperLUStat_t *); + MPI_Request [], SuperLUStat_t **, int_t *, int_t *, int_t, int_t); +extern void dlsum_bmod_inv_master(double *, double *, double *, double *, + int, int_t, int_t *, int_t *, int_t *, Ucb_indptr_t **, + int_t **, int_t *, gridinfo_t *, LocalLU_t *, + MPI_Request [], SuperLUStat_t **, int_t, int_t); extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, double [], int_t, double [], int_t, int, @@ -347,7 +354,8 @@ extern void dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *, double **, int_t **, int_t **); extern void dreadMM_dist(FILE *, int_t *, int_t *, int_t *, double **, int_t **, int_t **); - +extern int dread_binary(FILE *, int_t *, int_t *, int_t *, + double 
**, int_t **, int_t **); /* Distribute the data for numerical factorization */ extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *, ScalePermstruct_t *, Pslu_freeable_t *, diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 409df88b..f2738a15 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -42,7 +42,12 @@ at the top-level directory. #include #include #include +#include +#include +#if ( VTUNE>=1 ) +#include +#endif /************************************************************************* * Constants **************************************************************************/ @@ -75,6 +80,21 @@ at the top-level directory. #define IFMT "%8d" #endif + + + +// /* Define atomic int_t */ +// #ifdef _CRAY + // typedef atomic_short int_t_ato ; +// #elif defined (_LONGINT) + // typedef atomic_llong int_t_ato ; +// #else /* Default */ + // typedef atomic_int int_t_ato ; +// #endif + + + + #include "superlu_enum_consts.h" #include "Cnames.h" #include "supermatrix.h" @@ -167,6 +187,12 @@ at the top-level directory. #define Yk 22 #define LSUM 1000000 /* for now, make sure it's larger than nsupers*/ + +static const int BC_L=1; /* MPI tag for x in L-solve*/ +static const int RD_L=2; /* MPI tag for lsum in L-solve*/ +static const int BC_U=3; /* MPI tag for x in U-solve*/ +static const int RD_U=4; /* MPI tag for lsum in U-solve*/ + /* * Communication scopes */ @@ -792,47 +818,32 @@ typedef void* StdList; // typedef enum {NO, YES} yes_no_t; extern RdTree RdTree_Create(MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, double rseed); -extern void RdTree_Testsome(StdList TreeIdx, RdTree* ArrTrees, int* Outcount, int* FinishedTrees); -extern yes_no_t RdTree_Progress(RdTree Tree); -extern void RdTree_PostRecv(RdTree Tree); -extern void RdTree_SetDataReady(RdTree Tree); -extern void RdTree_SetLocalBuffer(RdTree Tree, void* localBuffer); -extern void RdTree_CleanupBuffers(RdTree Tree); -extern void RdTree_Reset(RdTree Tree); -extern void RdTree_AllocRecvBuffers(RdTree Tree); +extern void RdTree_Destroy(RdTree Tree); extern void RdTree_SetTag(RdTree Tree, int tag); -extern int RdTree_GetTag(RdTree Tree); extern yes_no_t RdTree_IsRoot(RdTree Tree); -extern yes_no_t RdTree_IsReady(RdTree Tree); -extern yes_no_t RdTree_StartForward(RdTree Tree); extern void RdTree_forwardMessageSimple(RdTree Tree, void* localBuffer); extern void RdTree_allocateRequest(RdTree Tree); extern int RdTree_GetDestCount(RdTree Tree); extern void RdTree_waitSendRequest(RdTree Tree); -extern void BcTree_AllocateBuffer(BcTree Tree); extern BcTree BcTree_Create(MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, double rseed); -extern void BcTree_Testsome(StdList TreeIdx, BcTree* ArrTrees, int *Outcount, int* FinishedTrees); -extern yes_no_t BcTree_Progress(BcTree Tree); -// extern int_t BcTree_Iprobe(BcTree Tree, MPI_Status* status); -extern void BcTree_SetDataReady(BcTree Tree); -extern void BcTree_SetLocalBuffer(BcTree Tree, void* localBuffer); -extern void BcTree_CleanupBuffers(BcTree Tree); -extern void BcTree_Reset(BcTree Tree); +extern void BcTree_Destroy(BcTree Tree); extern void BcTree_SetTag(BcTree Tree, int tag); extern yes_no_t BcTree_IsRoot(BcTree Tree); -extern yes_no_t BcTree_StartForward(BcTree Tree); extern void BcTree_forwardMessageSimple(BcTree Tree, void* localBuffer); extern void BcTree_allocateRequest(BcTree Tree); extern int BcTree_getDestCount(BcTree Tree); extern void BcTree_waitSendRequest(BcTree Tree); + +extern StdList StdList_Init(); +extern void StdList_Pushback(StdList lst, int_t dat); 
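The hunk above trims the broadcast/reduction tree interface down to the handful of calls the new solve kernels actually use (create, set tag, test root, forward, wait, destroy). As a rough illustration only — not part of the patch — the sketch below shows how that pared-down BcTree API can be driven for one block column of the L-solve; comm, ranks, rank_cnt, msgsize, rseed and xk_buf are placeholders for the values that the tree-setup code in ddist_psymbtonum() and the solve kernels would supply.

#include "superlu_defs.h"   /* BcTree, yes_no_t, BC_L; pulls in mpi.h */

/* Sketch: create one column-broadcast tree for X[k] and, on the root
 * process of that tree, inject the block so the tree relays it to the
 * other participating process rows. */
static void bcast_xk_sketch(MPI_Comm comm, int *ranks, int rank_cnt,
                            int msgsize, double rseed, double *xk_buf)
{
    BcTree tree = BcTree_Create(comm, ranks, rank_cnt, msgsize, rseed);
    BcTree_SetTag(tree, BC_L);          /* tag constant added in this header */
    BcTree_allocateRequest(tree);       /* one-time allocation of send requests */

    if (BcTree_IsRoot(tree) == YES) {
        BcTree_forwardMessageSimple(tree, xk_buf);  /* push X[k] down the tree */
        BcTree_waitSendRequest(tree);               /* drain the pending sends */
    }

    BcTree_Destroy(tree);
}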
+extern void StdList_Pushfront(StdList lst, int_t dat); +extern int_t StdList_Popfront(StdList lst); +extern yes_no_t StdList_Find(StdList lst, int_t dat); +extern int_t StdList_Size(StdList lst); +yes_no_t StdList_Empty(StdList lst); -extern void TreeTest(void* tree); -extern StdList StdList_Init(); -extern void StdList_Pushback(StdList lst, int dat); -extern yes_no_t StdList_Find(StdList lst, int dat); -extern int StdList_Size(StdList lst); #ifdef __cplusplus } diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h index b55bac2f..7e62d68d 100644 --- a/SRC/superlu_enum_consts.h +++ b/SRC/superlu_enum_consts.h @@ -82,6 +82,4 @@ typedef enum { NPHASES /* total number of phases */ } PhaseType; - - #endif /* __SUPERLU_ENUM_CONSTS */ diff --git a/SRC/superlu_timer.c b/SRC/superlu_timer.c index 6f6e682d..98e93772 100644 --- a/SRC/superlu_timer.c +++ b/SRC/superlu_timer.c @@ -71,7 +71,14 @@ double SuperLU_timer_() double SuperLU_timer_() { + +#ifdef _OPENMP + return omp_get_wtime(); +#else return MPI_Wtime(); +#endif + + } #endif diff --git a/SRC/timer.h b/SRC/timer.h index 2b5ce5d8..46a028fe 100644 --- a/SRC/timer.h +++ b/SRC/timer.h @@ -1,46 +1,3 @@ -/* - Copyright (c) 2012 The Regents of the University of California, - through Lawrence Berkeley National Laboratory. - - Author: Edgar Solomonik and Mathias Jacquelin - - This file is part of Cyclops Tensor Framework (CTF) and PEXSI. All rights - reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - (3) Neither the name of the University of California, Lawrence Berkeley - National Laboratory, U.S. Dept. of Energy nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/SRC/timer.h b/SRC/timer.h
index 2b5ce5d8..46a028fe 100644
--- a/SRC/timer.h
+++ b/SRC/timer.h
@@ -1,46 +1,3 @@
-/*
-  Copyright (c) 2012 The Regents of the University of California,
-  through Lawrence Berkeley National Laboratory.
-
-  Author: Edgar Solomonik and Mathias Jacquelin
-
-  This file is part of Cyclops Tensor Framework (CTF) and PEXSI. All rights
-  reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are met:
-
-  (1) Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-  (2) Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-  (3) Neither the name of the University of California, Lawrence Berkeley
-  National Laboratory, U.S. Dept. of Energy nor the names of its contributors may
-  be used to endorse or promote products derived from this software without
-  specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-  You are under no obligation whatsoever to provide any bug fixes, patches, or
-  upgrades to the features, functionality or performance of the source code
-  ("Enhancements") to anyone; however, if you choose to make your Enhancements
-  available either publicly, or directly to Lawrence Berkeley National
-  Laboratory, without imposing a separate written license agreement for such
-  Enhancements, then you hereby grant the following license: a non-exclusive,
-  royalty-free perpetual license to install, use, modify, prepare derivative
-  works, incorporate into other computer software, distribute, and sublicense
-  such enhancements or derivative works thereof, in binary and source code form.
-*/
 /// @file timer.h
 /// @brief Profiling and timing using TAU
 /// @date 2013-09-06
diff --git a/SRC/util.c b/SRC/util.c
index 2f954dc9..549dbf33 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -143,6 +143,64 @@ Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
     SUPERLU_FREE(Llu->bsendx_plist);
     SUPERLU_FREE(Llu->mod_bit);
 
+
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+        if ( Llu->Lindval_loc_bc_ptr[i] ) {
+            SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+        }
+    SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
+
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0;i<nb;++i){
+        if(Llu->LBtree_ptr[i]!=NULL){
+            BcTree_Destroy(Llu->LBtree_ptr[i]);
+        }
+        if(Llu->UBtree_ptr[i]!=NULL){
+            BcTree_Destroy(Llu->UBtree_ptr[i]);
+        }
+    }
+    SUPERLU_FREE(Llu->LBtree_ptr);
+    SUPERLU_FREE(Llu->UBtree_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i=0;i<nb;++i){
+        if(Llu->LRtree_ptr[i]!=NULL){
+            RdTree_Destroy(Llu->LRtree_ptr[i]);
+        }
+        if(Llu->URtree_ptr[i]!=NULL){
+            RdTree_Destroy(Llu->URtree_ptr[i]);
+        }
+    }
+    SUPERLU_FREE(Llu->LRtree_ptr);
+    SUPERLU_FREE(Llu->URtree_ptr);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0;i<nb;++i){
+        if(Llu->Linv_bc_ptr[i]!=NULL){
+            SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+        }
+        if(Llu->Uinv_bc_ptr[i]!=NULL){
+            SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+        }
+    }
+    SUPERLU_FREE(Llu->Linv_bc_ptr);
+    SUPERLU_FREE(Llu->Uinv_bc_ptr);
+
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+        if ( Llu->Urbs[i] ) {
+            SUPERLU_FREE(Llu->Ucb_indptr[i]);
+            SUPERLU_FREE(Llu->Ucb_valptr[i]);
+        }
+    SUPERLU_FREE(Llu->Ucb_indptr);
+    SUPERLU_FREE(Llu->Ucb_valptr);
+    SUPERLU_FREE(Llu->Urbs);
+
+
     SUPERLU_FREE(Glu_persist->xsup);
     SUPERLU_FREE(Glu_persist->supno);
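The new Destroy_LU code above repeats one pattern several times: walk an array that has one slot per block column (or block row), release the non-empty slots, then release the array itself. Condensed into a hypothetical helper (not something this patch adds), the pattern is:

    #include "superlu_defs.h"

    /* Free an array of CEILING(nsupers, nparts) per-block pointers in which
     * only the non-NULL slots own storage.  Illustrative helper only. */
    static void free_block_ptr_array(void **ptrs, int_t nsupers, int_t nparts)
    {
        int_t i, nb = CEILING(nsupers, nparts);
        for (i = 0; i < nb; ++i)
            if ( ptrs[i] ) SUPERLU_FREE(ptrs[i]);
        SUPERLU_FREE(ptrs);
    }
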
@@ -630,16 +688,24 @@ PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *gri
                printf("**************************************************\n");
     }
 
+    double *utime1,*utime2,*utime3,*utime4;
+    flops_t *ops1;
+
 #if ( PROFlevel>=1 )
-    fflush(stdout);
-    sleep(2.0);
     MPI_Barrier( grid->comm );
     {
         int_t i, P = grid->nprow*grid->npcol;
         flops_t b, maxflop;
+
+
+        if ( !iam )utime1=doubleMalloc_dist(P);
+        if ( !iam )utime2=doubleMalloc_dist(P);
+        if ( !iam )utime3=doubleMalloc_dist(P);
+        if ( !iam )utime4=doubleMalloc_dist(P);
+        if ( !iam )ops1=(flops_t *) SUPERLU_MALLOC(P * sizeof(flops_t));
+
         // fflush(stdout);
         // if ( !iam ) printf("\n.. Tree max sizes:\tbtree\trtree\n");
         // fflush(stdout);
@@ -659,70 +725,56 @@ PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *gri
         MPI_Barrier( grid->comm );
         if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n");
-        fflush(stdout);
+
+        MPI_Gather(&utime[COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm);
+        MPI_Gather(&utime[FACT], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm);
+        if ( !iam )
         for (i = 0; i < P; ++i) {
-            if ( iam == i) {
-                printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]);
-                fflush(stdout);
-            }
-            MPI_Barrier( grid->comm );
+            printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]);
         }
         fflush(stdout);
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
+        MPI_Barrier( grid->comm );
+
         if ( !iam ) printf("\n.. FACT ops distribution:\n");
-        fflush(stdout);
-        MPI_Barrier( grid->comm );
+        MPI_Gather(&ops[FACT], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm);
+
+        if ( !iam )
         for (i = 0; i < P; ++i) {
-            if ( iam == i ) {
-                printf("\t\t(%d)\t%e\n", iam, ops[FACT]);
-                fflush(stdout);
-            }
-            MPI_Barrier( grid->comm );
+            printf("\t\t(%d)\t%e\n", i, ops1[i]);
         }
-        MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
         fflush(stdout);
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
+        MPI_Barrier( grid->comm );
+
+        MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
+
         if ( !iam ) {
             b = factflop/P/maxflop;
             printf("\tFACT load balance: %.2f\n", b);
-            fflush(stdout);
         }
         fflush(stdout);
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
+        MPI_Barrier( grid->comm );
+
+
         if ( !iam ) printf("\n.. SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n");
-        fflush(stdout);
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
+
+        MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm);
+        MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm);
+        MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE,utime3, 1 , MPI_DOUBLE, 0, grid->comm);
+        MPI_Gather(&utime[SOL_L], 1, MPI_DOUBLE,utime4, 1 , MPI_DOUBLE, 0, grid->comm);
+        if ( !iam )
         for (i = 0; i < P; ++i) {
-            if ( iam == i) {
-                printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", iam,utime[SOL_COMM],utime[SOL_GEMM],utime[SOL_TRSM], utime[SOL_L]);
-                fflush(stdout);
-            }
-            MPI_Barrier( grid->comm );
+            printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i,utime1[i],utime2[i],utime3[i], utime4[i]);
         }
         fflush(stdout);
-
-
-
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
-        if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
-        fflush(stdout);
-        sleep(2.0);
         MPI_Barrier( grid->comm );
+
+        if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
+        MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm);
+        if ( !iam )
         for (i = 0; i < P; ++i) {
-            if ( iam == i ) {
-                printf("\t\t%d\t%e\n", iam, ops[SOLVE]);
-                fflush(stdout);
-            }
-            MPI_Barrier( grid->comm );
+            printf("\t\t%d\t%e\n", i, ops1[i]);
         }
         MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm);
-        sleep(2.0);
-        MPI_Barrier( grid->comm );
         if ( !iam ) {
             b = solveflop/P/maxflop;
             printf("\tSOLVE load balance: %.2f\n", b);
@@ -730,6 +782,15 @@ PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *gri
         }
     }
 
+
+    if ( !iam ){
+        SUPERLU_FREE(utime1);
+        SUPERLU_FREE(utime2);
+        SUPERLU_FREE(utime3);
+        SUPERLU_FREE(utime4);
+        SUPERLU_FREE(ops1);
+    }
+
 #endif
 
 /*  if ( !iam ) fflush(stdout);  CRASH THE SYSTEM pierre.
     */
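The rewritten PStatPrint above replaces the old per-rank round-robin (each rank printing its own numbers between sleep(2.0)/MPI_Barrier calls) with MPI_Gather of the timings and op counts to rank 0, which then prints every row itself. The shape of that pattern, reduced to a single value per rank, is shown below as a sketch with made-up names, not the library routine:

    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>

    void print_per_rank_time(double my_time, MPI_Comm comm)
    {
        int iam, P;
        double *all = NULL;

        MPI_Comm_rank(comm, &iam);
        MPI_Comm_size(comm, &P);
        if (!iam) all = (double *) malloc(P * sizeof(double));   /* root only */

        MPI_Gather(&my_time, 1, MPI_DOUBLE, all, 1, MPI_DOUBLE, 0, comm);

        if (!iam) {
            for (int i = 0; i < P; ++i)
                printf("\t\t(%d)%8.2f\n", i, all[i]);   /* one row per rank */
            fflush(stdout);
            free(all);
        }
    }
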
diff --git a/build/batch_script_mpi.sh b/build/batch_script_mpi.sh
index 6ae8b096..498386fb 100644
--- a/build/batch_script_mpi.sh
+++ b/build/batch_script_mpi.sh
@@ -116,7 +116,8 @@ OMP_NUM_THREADS=1
 	mkdir -p $MAT
 	echo "export OMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
 	echo "export KMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
-	echo "export MKL_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
+	echo "export MKL_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
+	echo "export MPICH_MAX_THREAD_SAFETY=multiple" >> $TMP_BATCH_FILE
 	echo "export NSUP=128" >> $TMP_BATCH_FILE
 	echo "export NREL=20" >> $TMP_BATCH_FILE
diff --git a/build/batch_script_mpi_runit.sh b/build/batch_script_mpi_runit.sh
index 0e09ca13..8c42251f 100644
--- a/build/batch_script_mpi_runit.sh
+++ b/build/batch_script_mpi_runit.sh
@@ -21,7 +21,7 @@ EXIT_PARAM=2
 CUR_DIR=`pwd`
 FILE_DIR=$CUR_DIR/EXAMPLE
-INPUT_DIR=~/Edison/my_research/SuperLU/SuperLUDIST_Begin/build_bac/EXAMPLE
+INPUT_DIR=/project/projectdirs/sparse/liuyangz/my_research/matrix
 FILE_NAME=pddrive
 FILE=$FILE_DIR/$FILE_NAME
 
@@ -64,19 +64,30 @@ fi
 #nprows=(6 12 24 48 )
 #npcols=(6 12 24 48 )
-nprows=(6 12 24 48 1 1 1 1 36 144 576 2304)
-npcols=(6 12 24 48 36 144 576 2304 1 1 1 1)
+#nprows=(6 12 24 48 1 1 1 1 36 144 576 2304)
+#npcols=(6 12 24 48 36 144 576 2304 1 1 1 1)
+
+# nprows=(32 128 512 1 1 1 4 8 16)
+# npcols=(1 1 1 32 128 512 8 16 32)
+
+#nprows=(2048 1 32)
+#npcols=(1 2048 64)
+
-#nprows=( 576 2304)
-#npcols=( 1 1)
 #nprows=(12 1 144)
 #npcols=(12 144 1)
+
+NREP=1
-
-#nprows=(48)
+#nprows=(4 8 16 32 45)
+#npcols=(4 8 16 32 45)
+#nprows=(32)
 #npcols=(48)
+
+nprows=(16)
+npcols=(16)
 
 for ((i = 0; i < ${#npcols[@]}; i++)); do
 NROW=${nprows[i]}
@@ -86,13 +97,14 @@ NCOL=${npcols[i]}
 
 CORE_VAL=`expr $NCOL \* $NROW`
 NODE_VAL=`expr $CORE_VAL / $CORES_PER_NODE`
 MOD_VAL=`expr $CORE_VAL % $CORES_PER_NODE`
+
 if [[ $MOD_VAL -ne 0 ]]
 then
   NODE_VAL=`expr $NODE_VAL + 1`
 fi
 
 #PARTITION=debug
 PARTITION=regular
-LICENSE=SCRATCH
+LICENSE=SCRATCH
 TIME=00:20:00
 
 if [[ $NERSC_HOST == edison ]]
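Both batch scripts now export MPICH_MAX_THREAD_SAFETY=multiple, which on Cray MPICH raises the thread-support ceiling so that MPI_THREAD_MULTIPLE can actually be granted. The application side still has to request and verify that level at startup; a sketch of that check (illustrative, not part of this patch):

    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        int provided;
        MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        if (provided < MPI_THREAD_MULTIPLE)
            fprintf(stderr, "Warning: MPI granted thread level %d only\n", provided);
        /* ... set up the process grid and call the SuperLU_DIST drivers ... */
        MPI_Finalize();
        return 0;
    }
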
@@ -106,17 +118,24 @@ fi
 
 OMP_NUM_THREADS=1
+THREADS_PER_RANK=`expr 2 \* $OMP_NUM_THREADS`
 
 #for NSUP in 128 64 32 16 8
 #do
   # for MAT in atmosmodl.rb nlpkkt80.mtx torso3.mtx Ga19As19H42.mtx A22.mtx cage13.rb
-  # for MAT in torso3.mtx
+  #for MAT in torso3.mtx
   # for MAT in matrix121.dat matrix211.dat tdr190k.dat tdr455k.dat nlpkkt80.mtx torso3.mtx helm2d03.mtx
   # for MAT in tdr190k.dat Ga19As19H42.mtx
-  for MAT in torso3.mtx
-  # for MAT in Ga19As19H42.mtx
-  do
+# for MAT in big.rua
+# for MAT in tdr455k.bin
+  # for MAT in A22.bin tdr455k.bin DG_GrapheneDisorder_8192.bin DNA_715_64cell.bin LU_C_BN_C_4by2.bin Li4244.bin atmosmodj.bin nlpkkt80.bin Ga19As19H42.bin Geo_1438.bin StocF-1465.bin cage13.bin
+# for MAT in globalmat118_1536.bin
+# for MAT in DG_PNF_14000.bin DG_GrapheneDisorder_32768.bin
+  # for MAT in DNA_715_64cell.mtx
+  # for MAT in Ga19As19H42.mtx cage13.rb Geo_1438.mtx nlpkkt80.mtx torso3.mtx helm2d03.mtx gsm_106857.mtx atmosmodj.mtx StocF-1465.mtx hvdc2.mtx
+  for MAT in Geo_1438.bin
+  do   # Start of looping stuff
 
 > $TMP_BATCH_FILE
 echo "#!/bin/bash -l" >> $TMP_BATCH_FILE
@@ -126,8 +145,8 @@ OMP_NUM_THREADS=1
 echo "#SBATCH -t $TIME" >> $TMP_BATCH_FILE
 echo "#SBATCH -L $LICENSE" >> $TMP_BATCH_FILE
 echo "#SBATCH -J SLU_$MAT" >> $TMP_BATCH_FILE
-	#echo "#SBATCH -o ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_flat_mrhs" >> $TMP_BATCH_FILE
-	#echo "#SBATCH -e ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_flat_mrhs" >> $TMP_BATCH_FILE
+	#echo "#SBATCH -o ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_mrhs" >> $TMP_BATCH_FILE
+	#echo "#SBATCH -e ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_mrhs" >> $TMP_BATCH_FILE
 # echo "#SBATCH --mail-type=BEGIN" >> $TMP_BATCH_FILE
 # echo "#SBATCH --mail-type=END" >> $TMP_BATCH_FILE
 echo "#SBATCH --mail-user=liuyangzhuan@lbl.gov" >> $TMP_BATCH_FILE
@@ -136,11 +155,15 @@ OMP_NUM_THREADS=1
 echo "#SBATCH -C $CONSTRAINT" >> $TMP_BATCH_FILE
 fi
 mkdir -p $MAT
-	echo "export OMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
-	echo "export KMP_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
-	echo "export MKL_NUM_THREADS=$OMP_NUM_THREADS" >> $TMP_BATCH_FILE
-	echo "export NSUP=128" >> $TMP_BATCH_FILE
-	echo "export NREL=20" >> $TMP_BATCH_FILE
+	export OMP_NUM_THREADS=$OMP_NUM_THREADS
+	export KMP_NUM_THREADS=$OMP_NUM_THREADS
+	export MKL_NUM_THREADS=$OMP_NUM_THREADS
+	export NSUP=128
+	export NREL=20
+
+	export OMP_PLACES=threads
+	export OMP_PROC_BIND=spread
+	export MPICH_MAX_THREAD_SAFETY=multiple
 	echo " " >> $TMP_BATCH_FILE
 	echo "FILE=$FILE" >> $TMP_BATCH_FILE
@@ -150,8 +173,12 @@ OMP_NUM_THREADS=1
 	echo "NCOL=$NCOL" >> $TMP_BATCH_FILE
 	echo "NROW=$NROW" >> $TMP_BATCH_FILE
 	# This should be computed individually for each script...
-
-	srun -n $CORE_VAL $FILE -c $NCOL -r $NROW $INPUT_DIR/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_async_simple_over_icollec_groupgemm_bettertree
+OUTPUT=./$MAT/SLU.o_mpi_${NROW}x${NCOL}_asyncU_cori
+	rm -rf $OUTPUT
+	for ii in `seq 1 $NREP`
+	do
+	srun -n $CORE_VAL -N $NODE_VAL -c $THREADS_PER_RANK --cpu_bind=cores $FILE -c $NCOL -r $NROW $INPUT_DIR/$MAT | tee -a $OUTPUT
+	done
 
 	# Add final line (srun line) to temporary slurm script
 	#cat $TMP_BATCH_FILE
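The run script keeps exporting NSUP and NREL before launching pddrive; these environment variables are consulted by the library at run time for its blocking parameters, so they take effect without recompiling. A getenv-based lookup of that kind looks roughly like the following sketch (the default values here are made up):

    #include <stdlib.h>

    /* Return an integer tuning variable such as NSUP or NREL from the
     * environment, falling back to a compile-time default when unset. */
    static int env_int_or_default(const char *name, int dflt)
    {
        const char *s = getenv(name);
        return s ? atoi(s) : dflt;
    }

    /* e.g.  int nsup = env_int_or_default("NSUP", 128); */
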
diff --git a/build/run_cmake_build.sh b/build/run_cmake_build.sh
index 69456a47..80fefb87 100644
--- a/build/run_cmake_build.sh
+++ b/build/run_cmake_build.sh
@@ -1,25 +1,43 @@
+#!/bin/bash
+# Bash script to submit many files to Cori/Edison/Queue
+Vtune=0
 export CRAYPE_LINK_TYPE=dynamic
-export PARMETIS_ROOT=~/Edison/my_software/parmetis-4.0.3_dynamic
+export PARMETIS_ROOT=~/Cori/my_software/parmetis-4.0.3_dynamic_longint
 export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
 rm -rf CMakeCache.txt
+rm -rf CMakeFiles
+rm -rf CTestTestfile.cmake
+rm -rf cmake_install.cmake
+rm -rf DartConfiguration.tcl
+
+if [[ ${Vtune} == 1 ]]; then
+INC_VTUNE="-g -DVTUNE=1 -I$VTUNE_AMPLIFIER_XE_2017_DIR/include"
+LIB_VTUNE="$VTUNE_AMPLIFIER_XE_2017_DIR/lib64/libittnotify.a"
+fi
+
 cmake .. \
 	-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
-	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so" \
-	-DCMAKE_C_FLAGS="-std=c99 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0" \
-	-DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \
+	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.so;${PARMETIS_BUILD_DIR}/libmetis/libmetis.so;${LIB_VTUNE}" \
 	-Denable_blaslib=OFF \
 	-DBUILD_SHARED_LIBS=ON \
 	-DCMAKE_C_COMPILER=cc \
+	-DCMAKE_CXX_COMPILER=CC \
 	-DCMAKE_INSTALL_PREFIX=. \
 	-DCMAKE_BUILD_TYPE=Release \
 	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+	-DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE ${INC_VTUNE}" \
+	-DCMAKE_C_FLAGS="-D_LONGINT -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 ${INC_VTUNE}" \
 	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so"
+# 	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so" \
+# 	-DCMAKE_CXX_FLAGS="-g -trace -Ofast -std=c++11 -DAdd_ -DRELEASE -tcollect -L$VT_LIB_DIR -lVT $VT_ADD_LIBS" \
+
+
 	# -DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_lapack95_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_blas95_lp64.a"
 	# -DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a"
 
 # DCMAKE_BUILD_TYPE=Release or Debug compiler options set in CMAKELIST.txt
-# -DCMAKE_C_FLAGS="-g -O0 -std=c99 -DPRNTlevel=2 -DPROFlevel=1 -DDEBUGlevel=0" \
+# -DCMAKE_C_FLAGS="-g -O0 -std=c99 -DPRNTlevel=2 -DPROFlevel=1 -DDEBUGlevel=0" \
+# -DCMAKE_C_FLAGS="-g -O0 -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0" \
diff --git a/build/simpleconf.txt b/build/simpleconf.txt
index 500ba31d..512e6ea4 100644
--- a/build/simpleconf.txt
+++ b/build/simpleconf.txt
@@ -1,3 +1,4 @@
+export MPICH_MAX_THREAD_SAFETY=multiple
 export OMP_NUM_THREADS=1
 export KMP_NUM_THREADS=1
 export MKL_NUM_THREADS=1
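run_cmake_build.sh now carries an optional VTune mode: with Vtune=1 it compiles with -DVTUNE=1 plus the VTune include path and links libittnotify.a, matching the new VTUNE>=1 include guard added to superlu_defs.h. A build like that is typically driven with the ittnotify pause/resume API so collection covers only the region of interest; the sketch below assumes ittnotify.h is the header behind that guard, and the guarded region is arbitrary:

    #if ( VTUNE>=1 )
    #include <ittnotify.h>
    #endif

    void profile_region_sketch(void)
    {
    #if ( VTUNE>=1 )
        __itt_resume();    /* start collection, e.g. when VTune was launched paused */
    #endif
        /* ... numerical phase to be profiled ... */
    #if ( VTUNE>=1 )
        __itt_pause();     /* stop collection again */
    #endif
    }
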
diff --git a/make.inc_good_dynamic b/make.inc_good_dynamic
new file mode 100644
index 00000000..4a47aadc
--- /dev/null
+++ b/make.inc_good_dynamic
@@ -0,0 +1,42 @@
+############################################################################
+#
+# Program: SuperLU_DIST
+#
+# Module: make.inc
+#
+# Purpose: Top-level Definitions
+#
+# Creation date: March 1, 2016 version 5.0.0
+#
+# Modified:
+#
+#
+############################################################################
+#
+# The name of the libraries to be created/linked to
+#
+SuperLUroot = /global/homes/l/liuyangz/Cori/my_research/github/superlu_dist_task_hybrid_whypddistslow_01_27_2018/build
+DSUPERLULIB = $(SuperLUroot)/SRC/libsuperlu_dist.so
+
+LIBS = $(DSUPERLULIB) /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3_dynamic/build/Linux-x86_64/libparmetis/libparmetis.so /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3_dynamic/build/Linux-x86_64/libmetis/libmetis.so
+
+#
+# The archiver and the flag(s) to use when building archive (library)
+# If your system has no ranlib, set RANLIB = echo.
+#
+ARCH = /usr/bin/ar
+ARCHFLAGS = cr
+RANLIB = /usr/bin/ranlib
+
+CC = /opt/cray/pe/craype/2.5.12/bin/cc
+CFLAGS = -O0 -g -I/global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3_dynamic/metis/include -I/global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3_dynamic/include -DUSE_VENDOR_BLAS -qopenmp -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0
+# CFLAGS += -D_LONGINT ## 64-bit integer
+# CFLAGS += -D
+# CFLAGS +=
+NOOPTS = -O0
+FORTRAN = /opt/cray/pe/craype/2.5.12/bin/ftn
+CPP =/opt/cray/pe/craype/2.5.12/bin/CC
+CPPFLAGS = -Ofast -std=c++11 -DAdd_
+
+LOADER = $(CPP)
+LOADOPTS = $(LIBS) -Wl,-rpath,-qopenmp -qopenmp -dynamic
diff --git a/make.inc_good_static b/make.inc_good_static
new file mode 100644
index 00000000..e11d8899
--- /dev/null
+++ b/make.inc_good_static
@@ -0,0 +1,41 @@
+############################################################################
+#
+# Program: SuperLU_DIST
+#
+# Module: make.inc
+#
+# Purpose: Top-level Definitions
+#
+# Creation date: March 1, 2016 version 5.0.0
+#
+# Modified:
+#
+#
+############################################################################
+#
+# The name of the libraries to be created/linked to
+#
+SuperLUroot = /global/homes/l/liuyangz/Cori/my_research/github/superlu_dist_task_hybrid_whypddistslow_01_27_2018/build
+DSUPERLULIB = $(SuperLUroot)/SRC/libsuperlu_dist.a
+
+LIBS = $(DSUPERLULIB) /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a /opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a /global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a
+
+#
+# The archiver and the flag(s) to use when building archive (library)
+# If your system has no ranlib, set RANLIB = echo.
+#
+ARCH = /usr/bin/ar
+ARCHFLAGS = cr
+RANLIB = /usr/bin/ranlib
+
+CC = /opt/cray/pe/craype/2.5.12/bin/cc
+CFLAGS = -O3 -DNDEBUG -I/global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/metis/include -I/global/homes/l/liuyangz/Edison/my_software/parmetis-4.0.3/include -DUSE_VENDOR_BLAS -qopenmp -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0
+# CFLAGS += -D_LONGINT ## 64-bit integer
+# CFLAGS += -D
+# CFLAGS +=
+NOOPTS = -O0
+FORTRAN = /opt/cray/pe/craype/2.5.12/bin/ftn
+CPP =/opt/cray/pe/craype/2.5.12/bin/CC
+CPPFLAGS = -Ofast -std=c++11 -DAdd_
+LOADER = $(CPP)
+LOADOPTS = -Wl,-rpath,-qopenmp -qopenmp